diff --git a/.azure-pipelines/templates/Rust.Build.Job.yml b/.azure-pipelines/templates/Rust.Build.Job.yml index 596a0ca3..fc260be1 100644 --- a/.azure-pipelines/templates/Rust.Build.Job.yml +++ b/.azure-pipelines/templates/Rust.Build.Job.yml @@ -133,6 +133,15 @@ jobs: displayName: Build linux-test-proxy condition: eq('${{ item.os }}', 'linux') + # plm (Permissive Learning Mode) is functionally Windows-only, but it + # builds cross-platform via a no-op stub `fn main()` so it sits in the + # workspace default-members. Tests run on x64 only (arm64 emulation in + # 1ES windows-11-arm is too slow for the WPR-touching helpers). + - script: | + cargo test --release --target $(triplet) -p plm --manifest-path $(Build.SourcesDirectory)/src/Cargo.toml + displayName: Test plm + condition: and(eq('${{ item.os }}', 'windows'), eq('${{ item.arch }}', 'x64')) + - task: EsrpCodeSigning@5 displayName: Code Sign condition: and(eq(${{ parameters.isOfficialBuild }}, true), eq('${{ item.os }}', 'windows')) diff --git a/.github/workflows/Build.Linux.Job.yml b/.github/workflows/Build.Linux.Job.yml index 2c9a55b8..8f060b45 100644 --- a/.github/workflows/Build.Linux.Job.yml +++ b/.github/workflows/Build.Linux.Job.yml @@ -75,6 +75,21 @@ jobs: run: cargo test --locked --release --target ${{ matrix.target }} -p wxc_e2e_tests + # PLM (Permissive Learning Mode) is functionally Windows-only, but the + # crate builds cross-platform: the lib's helper modules compile on every + # target, and the binary has a no-op stub `fn main()` for non-Windows so + # the crate lives in the workspace `default-members` list. This gate + # keeps the cross-platform/Windows-only split a CI-enforced contract — if + # someone adds a `use windows::…` without a `#[cfg]` gate the Linux + # build fails fast rather than rotting silently. 
+ - name: Build plm (cross-platform stub + lib) + working-directory: src + run: cargo build --locked --release --target ${{ matrix.target }} -p plm + + - name: Test plm (cross-platform modules) + working-directory: src + run: cargo test --locked --release --target ${{ matrix.target }} -p plm + # linux_test_proxy is a separate workspace member, not a dep of lxc. - name: Build linux-test-proxy working-directory: src diff --git a/README.md b/README.md index adf5654d..b77383f1 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,16 @@ wxc-exec.exe --debug config.json See [docs/diagnostics.md](docs/diagnostics.md) for full diagnostics reference. +### Audit Mode (Permissive Learning Mode) + +`--audit` runs the policy in **permissive** mode — denied operations are logged but allowed to proceed — and starts a Permissive Learning Mode (PLM) ETW trace alongside the workload. See [src/core/plm/readme.md](src/core/plm/readme.md) for the full PLM tool reference, including standalone `plm.exe` invocation (e.g. re-processing an existing `.etl` with `plm stop --trace-file …`). + +```bash +wxc-exec.exe --audit policy.json +``` + +> **Warning:** In release builds, `--audit` relaxes the rejection of `permissiveLearningMode` — AppContainer restrictions are **not** enforced for the duration of the run. Use only for policy authoring. + ## Documentation | Document | Description | diff --git a/build.bat b/build.bat index cb7b170e..80a0f304 100644 --- a/build.bat +++ b/build.bat @@ -40,6 +40,10 @@ if "%BUILD_ALL%"=="0" if "%BUILD_ARCH%"=="" ( :: Build flags set "CARGO_FLAGS=--target" if "%BUILD_CONFIG%"=="release" set "CARGO_FLAGS=--release --target" +:: plm is a standalone Windows-only binary that does not consume any of the +:: workspace feature flags above, so it uses its own profile/target-only flags. 
+set "PLM_FLAGS=--target" +if "%BUILD_CONFIG%"=="release" set "PLM_FLAGS=--release --target" if "%WITH_NANVIX%"=="1" set "CARGO_FLAGS=--features microvm %CARGO_FLAGS%" if "%WITH_WSLC%"=="1" set "CARGO_FLAGS=--features wslc %CARGO_FLAGS%" if "%WITH_ISOLATION_SESSION%"=="1" set "CARGO_FLAGS=--features isolation_session %CARGO_FLAGS%" @@ -66,11 +70,14 @@ if not errorlevel 1 ( if "%BUILD_ALL%"=="1" ( echo Target: x86_64-pc-windows-msvc cargo build %CARGO_FLAGS% x86_64-pc-windows-msvc || goto :error + cargo build -p plm %PLM_FLAGS% x86_64-pc-windows-msvc || goto :error echo Target: aarch64-pc-windows-msvc cargo build %CARGO_FLAGS% aarch64-pc-windows-msvc || goto :error + cargo build -p plm %PLM_FLAGS% aarch64-pc-windows-msvc || goto :error ) else ( echo Target: %BUILD_ARCH% cargo build %CARGO_FLAGS% %BUILD_ARCH% || goto :error + cargo build -p plm %PLM_FLAGS% %BUILD_ARCH% || goto :error ) echo Check formatting cargo fmt --all -- --check || goto :error @@ -108,6 +115,10 @@ for %%T in (x86_64-pc-windows-msvc aarch64-pc-windows-msvc) do ( copy /Y "!BIN_DIR!\wxc-host-prep.exe" "sdk\bin\!SDK_ARCH!\" >nul echo Copied !SDK_ARCH!\wxc-host-prep.exe ) + if exist "!BIN_DIR!\plm.exe" ( + copy /Y "!BIN_DIR!\plm.exe" "sdk\bin\!SDK_ARCH!\" >nul + echo Copied !SDK_ARCH!\plm.exe + ) if "%WITH_NANVIX%"=="1" ( for %%B in (nanvixd.exe nanvix_rootfs.img python3.initrd) do ( if exist "!BIN_DIR!\%%B" ( @@ -147,6 +158,11 @@ popd echo. echo Building SDK integration tests... pushd sdk\tests\integration +:: npm caches `file:` deps by package.json version. The local SDK version +:: rarely bumps between builds, so a plain `npm install` keeps reusing the +:: stale packed copy. Force a refresh of the @microsoft/mxc-sdk link so +:: type-checking sees the dist we just rebuilt above. 
+if exist node_modules\@microsoft\mxc-sdk rmdir /s /q node_modules\@microsoft\mxc-sdk call npm install & call npm run build popd diff --git a/sdk/tests/integration/package-lock.json b/sdk/tests/integration/package-lock.json index 78345293..a110d07f 100644 --- a/sdk/tests/integration/package-lock.json +++ b/sdk/tests/integration/package-lock.json @@ -19,7 +19,7 @@ } }, "node_modules/@microsoft/mxc-sdk": { - "version": "0.6.1", + "version": "0.7.0", "resolved": "file:../..", "license": "MIT", "dependencies": { @@ -31,9 +31,9 @@ } }, "node_modules/@types/node": { - "version": "20.19.39", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.39.tgz", - "integrity": "sha512-orrrD74MBUyK8jOAD/r0+lfa1I2MO6I+vAkmAWzMYbCcgrN4lCrmK52gRFQq/JRxfYPfonkr4b0jcY7Olqdqbw==", + "version": "20.19.43", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.43.tgz", + "integrity": "sha512-6oYBAi5ikg4Pl+kGsoYtawUMBT2zZMCvPNF7pVLnHZfd1zf38DRiWn/gT01RYCdUqkv7Fhr+C9ot4/tb+2sVvA==", "dev": true, "license": "MIT", "dependencies": { @@ -89,9 +89,9 @@ } }, "node_modules/lru-cache": { - "version": "11.3.6", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.3.6.tgz", - "integrity": "sha512-Gf/KoL3C/MlI7Bt0PGI9I+TeTC/I6r/csU58N4BSNc4lppLBeKsOdFYkK+dX0ABDUMJNfCHTyPpzwwO21Awd3A==", + "version": "11.5.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.5.1.tgz", + "integrity": "sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==", "dev": true, "license": "BlueOak-1.0.0", "engines": { @@ -185,9 +185,9 @@ } }, "node_modules/semver": { - "version": "7.7.4", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", - "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "version": "7.8.5", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.5.tgz", + "integrity": 
"sha512-Y7/KDsb8LjooZpwaqGyulO6DQlksgCncchHGk+sZIY4SBvUocMBEFH5Ur1fI4dV+Jvl0w6cjvucaIi40puRioA==", "license": "ISC", "bin": { "semver": "bin/semver.js" diff --git a/src/Cargo.lock b/src/Cargo.lock index 0c693c46..2f5ca724 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1611,6 +1611,21 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +[[package]] +name = "plm" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "clap", + "mxc_build_common", + "roxmltree", + "serde_json", + "tempfile", + "windows", + "wxc_common", +] + [[package]] name = "portable-atomic" version = "1.13.1" @@ -1749,6 +1764,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "roxmltree" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97" + [[package]] name = "rust-embed" version = "8.11.0" @@ -2930,6 +2951,7 @@ dependencies = [ "nanvix_binaries", "nanvix_build_common", "nanvix_runner", + "plm", "serde_json", "windows", "windows_sandbox_common", diff --git a/src/Cargo.toml b/src/Cargo.toml index 84e7ba36..821fa71b 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -7,6 +7,7 @@ members = [ "core/mxc-sdk", "core/mxc_pty", "core/mxc_build_common", + "core/plm", "core/generated/base_container_specification", "backends/appcontainer/common", "backends/windows_sandbox/daemon", @@ -82,6 +83,8 @@ windows = { version = "0.62", features = [ "Win32_System_SystemServices", "Win32_System_SystemInformation", "Win32_System_JobObjects", + "Win32_Security_WinTrust", + "Win32_Security_Cryptography", ] } windows-core = "0.62" serde = { version = "1", features = ["derive"] } @@ -90,6 +93,8 @@ thiserror = "2" anyhow = "1" base64 = "0.22" clap = { version = "4", features = ["derive"] } +chrono = { version = "0.4", default-features = false, features = ["std", "clock"] 
} +roxmltree = "0.20" wxc_common = { path = "core/wxc_common" } appcontainer_common = { path = "backends/appcontainer/common" } windows_sandbox_common = { path = "backends/windows_sandbox/common" } diff --git a/src/backends/appcontainer/common/src/appcontainer_runner.rs b/src/backends/appcontainer/common/src/appcontainer_runner.rs index dee3a502..b71047ba 100644 --- a/src/backends/appcontainer/common/src/appcontainer_runner.rs +++ b/src/backends/appcontainer/common/src/appcontainer_runner.rs @@ -442,19 +442,15 @@ impl AppContainerScriptRunner { // --- Validate permissiveLearningMode --- for cap in &request.policy.capabilities { if cap == "permissiveLearningMode" { - #[cfg(debug_assertions)] - { + if request.audit { logger.log_line("*** SECURITY WARNING ***"); logger.log_line( "permissiveLearningMode is ENABLED. \ Container will learn and record access patterns.", ); - } - #[cfg(not(debug_assertions))] - { + } else { return Err(WxcError::Validation( - "SECURITY: permissiveLearningMode not allowed in release builds" - .to_string(), + "SECURITY: permissiveLearningMode requires --audit".to_string(), )); } } diff --git a/src/core/plm/Cargo.toml b/src/core/plm/Cargo.toml new file mode 100644 index 00000000..bf86dd5e --- /dev/null +++ b/src/core/plm/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "plm" +version = "0.1.0" +edition.workspace = true + +[lib] +name = "plm" +path = "src/lib.rs" + +[[bin]] +name = "plm" +path = "src/main.rs" + +[dependencies] +clap.workspace = true +anyhow.workspace = true +# Portable deps (config / access_event / event_parser) must compile on +# every target so their unit tests run in cross-platform CI. The +# `windows` crate stays target-gated below. 
+serde_json.workspace = true +chrono.workspace = true +roxmltree.workspace = true + +[target.'cfg(target_os = "windows")'.dependencies] +windows = { workspace = true, features = [ + "Win32_System_EventLog", + "Win32_System_SystemInformation", + "Win32_System_Threading", +] } +wxc_common = { workspace = true } + +[build-dependencies] +mxc_build_common.workspace = true + +[dev-dependencies] +tempfile.workspace = true diff --git a/src/core/plm/build.rs b/src/core/plm/build.rs new file mode 100644 index 00000000..7403ce9b --- /dev/null +++ b/src/core/plm/build.rs @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Build script for plm — embeds Windows VersionInfo. + +fn main() { + mxc_build_common::embed_version_info("MXC permissive learning mode", "plm.exe"); +} diff --git a/src/core/plm/readme.md b/src/core/plm/readme.md new file mode 100644 index 00000000..1fcbf4e7 --- /dev/null +++ b/src/core/plm/readme.md @@ -0,0 +1,83 @@ +# PLM — Permissive Learning Mode + +`plm.exe` is the Windows-only trace driver for permissive learning mode. Long-form, it captures the access-denied events emitted by Windows' permissive sandbox layer, decodes them into structured findings, and merges those findings back into an MXC container config so the next enforcing run succeeds. + +This PR introduces the **trace-lifecycle skeleton only**: WPR start/stop, the host-wide singleton mutex, the embedded `plm.wprp` materializer, and the `wxc-exec --audit` plumbing. Event parsing, capability extraction, filesystem/UI merging, and the adjusted-config writer arrive in subsequent PRs. + +PLM is invoked automatically by [`wxc-exec --audit`](../../../README.md#audit-mode-permissive-learning-mode); the standalone CLI documented here is for capturing traces, interactive iteration, and (later) debugging the parser itself. + +## How it works (skeleton) + +1. 
**Capture** — `plm start` calls `wpr -start <plm.wprp>!AccessFailureProfile -filemode`, enabling the `Microsoft-Windows-Privacy-Auditing-PermissiveLearningMode` and `Microsoft-Windows-Kernel-General` ETW providers in a secure realtime collector.
+2. **Run** — the operator runs the workload. The OS-side permissive sandbox logs `EventID=14` / `EventID=27` for every access that *would* have been denied.
+3. **Stop** — `plm stop` calls `wpr -stop <trace.etl>` and records the captured trace location.
+4. **Parse / Merge** — *(arrives in later PRs)* the `.etl` is walked with `EvtQuery` / `EvtRender` and findings are merged into a copy of the input config as `Adjusted_<name>.json`.
+
+## Layout (this PR)
+
+| File | Role |
+|-----------------------|-------------------------------------------------------------------------------------|
+| `src/main.rs` | `clap` dispatch for `plm start` / `plm stop` / `plm log` (`extract-caps` lands later) |
+| `src/start.rs` | `wpr -cancel` (best-effort) + `wpr -start …!AccessFailureProfile -filemode` |
+| `src/stop.rs` | `wpr -stop` (or skip with `--trace-file`); parse + merge arrive in later PRs |
+| `src/log.rs` | Interactive mode: Enter to start, Enter to stop; preview arrives in later PRs |
+| `src/coordination.rs` | Cross-process singleton named-mutex + bypass-env-var coordination for `plm log` |
+| `src/wpr_path.rs` | Resolves `wpr.exe` to its absolute `%SystemRoot%\System32` path (PATH-spoof-safe) |
+| `src/profile_gen.rs` | Inline WPR profile (`EMBEDDED_WPRP`) + run-time writer that drops `plm.wprp` next to `plm.exe` when missing |
+
+## CLI
+
+### `plm start`
+
+Cancels any in-progress WPR session and starts a new permissive-learning-mode trace.
+
+```powershell
+plm.exe start [--wprp <path>]
+```
+
+| Flag | Default | Purpose |
+|------------|------------------------|---------------------------------------------------------------|
+| `--wprp` | `<exe-dir>\plm.wprp` | Override the WPR profile path. 
By default `plm` materializes its embedded profile next to the exe on first use; an existing `plm.wprp` is never overwritten, so operator hand-edits are preserved. |
+
+### `plm stop`
+
+Stops the active trace (or accepts a previously captured one).
+
+```powershell
+plm.exe stop [--config-path <path>] [--log-dir <dir>] [--bin-path <path>]
+             [--adjusted-config-path <path>] [--trace-file <etl>]
+             [--verbose-logging]
+```
+
+`--config-path` / `--adjusted-config-path` are accepted today so `wxc-exec --audit` can pass them through; the merge that consumes them arrives in subsequent PRs.
+
+### `plm log`
+
+Interactive iteration mode: press Enter to start a trace, run the workload, press Enter again to stop. The "diff against a blank config" preview arrives in later PRs.
+
+```powershell
+plm.exe log [--wprp <path>] [--verbose-logging]
+```
+
+## Building
+
+PLM is a regular member of the MXC workspace and is listed in `default-members`: the library's portable modules compile on every target, and on non-Windows targets the binary is a stub `fn main()` that prints a message and exits non-zero. The functional binary is Windows-only. To build just `plm.exe`:
+
+```powershell
+cd C:\src\mxc\src
+cargo build -p plm --target x86_64-pc-windows-msvc
+# or for release:
+cargo build -p plm --target x86_64-pc-windows-msvc --release
+```
+
+The WPR profile is embedded into `plm.exe` itself (see `src/profile_gen.rs`); on first use of `plm start` / `plm log`, `profile_gen::ensure_wprp_next_to_exe` writes it to disk next to the binary if no `plm.wprp` is already present. `build.bat` from the repo root builds `plm.exe` and stages it next to `wxc-exec.exe` for the `--audit` integration.
+
+## Limitations
+
+- **Windows-only.** Uses `wpr.exe` and Job-Object UI-limit semantics that have no portable equivalent.
+- **No parse-and-merge yet.** `plm stop` writes the captured `.etl` to the log directory but does not yet produce an `Adjusted_*.json`. Later PRs add file-path extraction, capability extraction, UI-policy extraction, and the adjusted-config writer. 
+ +## See also + +- [`docs/base-process-container/guide.md`](../../../docs/base-process-container/guide.md) — process-container backend overview +- [README → Debugging → Audit Mode](../../../README.md#audit-mode-permissive-learning-mode) — `wxc-exec --audit` integration diff --git a/src/core/plm/src/coordination.rs b/src/core/plm/src/coordination.rs new file mode 100644 index 00000000..c3eceeb1 --- /dev/null +++ b/src/core/plm/src/coordination.rs @@ -0,0 +1,294 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Cross-process coordination primitives shared by `plm.exe` and the +//! `wxc-exec --audit` driver in the `wxc` crate. Centralises the +//! singleton bypass env-var name and the `wait_until_cleared` ctrl- +//! handler helper so the two binaries cannot drift apart and can both +//! exercise the same tested implementation. + +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +/// Set to `true` while a standalone `plm log` invocation is spawning +/// `wpr -start` and has not yet returned. Read by `plm.exe`'s console- +/// control handler so a Ctrl+C arriving in the spawn window is +/// bounded-waited for, instead of issuing `wpr -cancel` against a +/// not-yet-engaged kernel session and leaking it. Lifted into the +/// shared library (rather than living as a `static` inside +/// `plm/src/main.rs`) so the `log` module can flip it directly +/// without a callback round-trip. +pub static PLM_LOG_START_IN_FLIGHT: AtomicBool = AtomicBool::new(false); + +/// Maximum time either console-control handler will wait on its +/// in-flight flag (`AUDIT_START_IN_FLIGHT` in `wxc-exec`, +/// `PLM_LOG_START_IN_FLIGHT` in `plm.exe`) before falling through to +/// `wpr -cancel`. 
Shared between `wxc-exec`'s `dacl_ctrl_handler` +/// (which runs TWO bounded waits back-to-back — the DACL `try_lock` +/// drain and the `wait_until_cleared` call) and `plm.exe`'s +/// `plm_ctrl_handler` so the two binaries cannot drift apart. +/// Lifting the constant here makes drift a compile-time impossibility. +/// +/// The 2s budget is chosen so the combined budget of the wxc-exec +/// handler (`2 * CTRL_HANDLER_DRAIN_TIMEOUT`) stays under the +/// ~5s OS-imposed kill budget for `CTRL_CLOSE_EVENT` / +/// `CTRL_LOGOFF_EVENT` / `CTRL_SHUTDOWN_EVENT`, with ~500ms of +/// slack for the actual `wpr -cancel` spawn. Pinned by +/// `tests::ctrl_handler_drain_timeout_respects_os_budget`. +pub const CTRL_HANDLER_DRAIN_TIMEOUT: Duration = Duration::from_secs(2); + +/// Env var set by `wxc-exec --audit` before spawning `plm.exe`. When +/// present, the spawned `plm` binary skips its own singleton mutex +/// acquisition because the outer `wxc-exec` already holds it for the +/// whole audit window. Avoids a deadlock between parent and child on +/// the same `Global\Mxc_Plm_Audit` name. +pub const SINGLETON_HELD_BY_PARENT_ENV: &str = "MXC_PLM_AUDIT_SINGLETON_HELD"; + +/// True when the env-var set by the audit-driving parent process is +/// present. Extracted from `acquire_singleton_if_needed` so the +/// bypass branch is reachable from unit tests. +pub fn singleton_bypass_requested() -> bool { + std::env::var_os(SINGLETON_HELD_BY_PARENT_ENV).is_some() +} + +/// Spin until `flag` reads `false`, or `timeout` elapses. Polls every +/// `poll_interval`. Returns `true` if the flag cleared in time, +/// `false` on timeout. +/// +/// Used by both `wxc-exec`'s `dacl_ctrl_handler` (waiting for `plm +/// start` to drain before issuing `wpr -cancel`) and `plm.exe`'s +/// `plm_ctrl_handler`. 
+pub fn wait_until_cleared(flag: &AtomicBool, timeout: Duration, poll_interval: Duration) -> bool { + let deadline = Instant::now() + timeout; + while flag.load(Ordering::SeqCst) { + if Instant::now() >= deadline { + return false; + } + std::thread::sleep(poll_interval); + } + true +} + +/// Windows-only shared implementation of the `Global\Mxc_Plm_Audit` +/// named-mutex singleton. Both `plm.exe` and `wxc-exec --audit` +/// serialize on the same name so two concurrent PLM traces can't share +/// the single NT Kernel Logger session. +#[cfg(target_os = "windows")] +pub mod singleton { + use std::sync::atomic::{AtomicIsize, Ordering}; + + /// Outcome of `try_acquire`. Callers translate to their own error + /// type (anyhow / String / etc). + pub enum AcquireError { + /// Another process already holds the singleton mutex. + AlreadyHeld, + /// `CreateMutexW` failed for a non-conflict reason. + CreateFailed(windows::core::Error), + } + + /// Attempt to acquire the host-wide PLM audit mutex, stashing the + /// raw handle in `slot` so both `Drop`-based release and the + /// pre-`ExitProcess` cleanup can find it. + /// + /// Uses the `CreateMutexW` + `WaitForSingleObject(0)` two-step + /// pattern rather than `CreateMutexW(bInitialOwner=true)` so we + /// correctly detect the "previous owner crashed without + /// releasing" case (Windows surfaces this as `WAIT_ABANDONED_0` + /// on the wait, never on the create). Treating an abandoned + /// mutex as `AlreadyHeld` would leave a stale singleton forever + /// after any PLM crash and force operators to reboot; we instead + /// take ownership silently, since the abandoned wpr session (if + /// any) is torn down separately by the caller's normal cancel + /// path. 
+ pub fn try_acquire(slot: &AtomicIsize) -> Result<(), AcquireError> { + use windows::core::w; + use windows::Win32::Foundation::{ + CloseHandle, WAIT_ABANDONED, WAIT_OBJECT_0, WAIT_TIMEOUT, + }; + use windows::Win32::System::Threading::{CreateMutexW, WaitForSingleObject}; + + let name = w!("Global\\Mxc_Plm_Audit"); + // Open (or create) the named mutex without requesting initial + // ownership; ownership is acquired via the wait below so we + // can distinguish "someone else holds it" from "previous + // owner crashed and we now own the abandoned mutex". + let handle = + unsafe { CreateMutexW(None, false, name) }.map_err(AcquireError::CreateFailed)?; + let wait = unsafe { WaitForSingleObject(handle, 0) }; + match wait { + WAIT_OBJECT_0 | WAIT_ABANDONED => { + // We now own the mutex (either freshly or by + // inheriting an abandoned one). + slot.store(handle.0 as isize, Ordering::SeqCst); + Ok(()) + } + WAIT_TIMEOUT => { + unsafe { + let _ = CloseHandle(handle); + } + Err(AcquireError::AlreadyHeld) + } + other => { + unsafe { + let _ = CloseHandle(handle); + } + // Prefer the OS's last-error (set by WAIT_FAILED); + // fall back to encoding the raw wait return as an + // HRESULT for exotic values. + let thread_err = windows::core::Error::from_thread(); + Err(AcquireError::CreateFailed(if thread_err.code().is_err() { + thread_err + } else { + windows::core::Error::from_hresult(windows::core::HRESULT(other.0 as i32)) + })) + } + } + } + + /// Release the singleton if `slot` holds a live handle. Idempotent: + /// safe to call from `Drop`, from explicit pre-`process::exit` + /// cleanup, and from error paths. 
+    pub fn release(slot: &AtomicIsize) {
+        // Atomically take the handle out of the slot so that, of several
+        // racing callers, only one sees a non-zero value and frees it.
+        let raw = slot.swap(0, Ordering::SeqCst);
+        if raw != 0 {
+            let handle = windows::Win32::Foundation::HANDLE(raw as *mut _);
+            // Both results are deliberately ignored: this is best-effort
+            // cleanup. NOTE(review): `ReleaseMutex` succeeds only on the
+            // thread that owns the mutex — confirm that is acceptable for
+            // callers that release from a different thread.
+            unsafe {
+                let _ = windows::Win32::System::Threading::ReleaseMutex(handle);
+                let _ = windows::Win32::Foundation::CloseHandle(handle);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+
+    // ---- singleton bypass ------------------------------------------------
+    //
+    // The env-var lookup is process-global, so multiple tests racing
+    // on it would interfere. Serialise them with a module-local mutex.
+    // (We can't use `serial_test` without pulling in a new dep, and a
+    // bespoke mutex is sufficient for the env-var tests below.)
+    use std::sync::Mutex;
+    static ENV_LOCK: Mutex<()> = Mutex::new(());
+
+    #[test]
+    fn singleton_bypass_requested_returns_false_when_env_unset() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        // SAFETY: the lock above serializes env mutation within this
+        // test binary. Other test binaries can't see this env var
+        // (Cargo runs each integration test in its own process), and
+        // production callers always inherit it from wxc-exec.
+        std::env::remove_var(SINGLETON_HELD_BY_PARENT_ENV);
+        assert!(!singleton_bypass_requested());
+    }
+
+    #[test]
+    fn singleton_bypass_requested_returns_true_when_env_set() {
+        let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
+        std::env::set_var(SINGLETON_HELD_BY_PARENT_ENV, "1");
+        let observed = singleton_bypass_requested();
+        std::env::remove_var(SINGLETON_HELD_BY_PARENT_ENV);
+        assert!(observed);
+    }
+
+    // The bypass fires whenever the variable is present, whatever its
+    // value (the empty string and "0" both count as set), so the parent
+    // only needs the env var to be present, not equal to "1". Pin that
+    // contract so a future refactor doesn't tighten the check.
+ #[test] + fn singleton_bypass_requested_returns_true_for_any_value() { + let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner()); + std::env::set_var(SINGLETON_HELD_BY_PARENT_ENV, ""); + let observed_empty = singleton_bypass_requested(); + std::env::set_var(SINGLETON_HELD_BY_PARENT_ENV, "0"); + let observed_zero = singleton_bypass_requested(); + std::env::remove_var(SINGLETON_HELD_BY_PARENT_ENV); + assert!(observed_empty, "empty string should still count as set"); + assert!(observed_zero, "\"0\" should still count as set"); + } + + // ---- ctrl-handler drain budget -------------------------------------- + + // Pin the OS-budget invariant. Windows imposes a hard ~5s kill + // timer on `CTRL_CLOSE_EVENT` / `CTRL_LOGOFF_EVENT` / + // `CTRL_SHUTDOWN_EVENT` handlers. The wxc-exec handler runs two + // back-to-back bounded waits each capped at + // `CTRL_HANDLER_DRAIN_TIMEOUT`, so `2 * CTRL_HANDLER_DRAIN_TIMEOUT` + // must stay under that budget with some slack for the actual + // `wpr -cancel` spawn that follows. A future bump to >2s + // reintroduces the ETW-session leak silently — this test fails + // the build instead. + #[test] + fn ctrl_handler_drain_timeout_respects_os_budget() { + let combined = CTRL_HANDLER_DRAIN_TIMEOUT + .checked_mul(2) + .expect("2 * timeout overflows"); + assert!( + combined <= Duration::from_millis(4500), + "2 * CTRL_HANDLER_DRAIN_TIMEOUT ({combined:?}) must stay under \ + the ~5s OS kill budget for CTRL_CLOSE/LOGOFF/SHUTDOWN, with \ + ~500ms slack for `wpr -cancel` to spawn" + ); + } + + // ---- wait_until_cleared --------------------------------------------- + + #[test] + fn wait_until_cleared_returns_true_when_flag_already_false() { + let flag = AtomicBool::new(false); + let started = Instant::now(); + assert!(wait_until_cleared( + &flag, + Duration::from_secs(5), + Duration::from_millis(10) + )); + // Should be effectively instantaneous (well under the timeout). 
+ assert!( + started.elapsed() < Duration::from_millis(500), + "no-wait path must not sleep" + ); + } + + #[test] + fn wait_until_cleared_returns_false_on_timeout() { + let flag = AtomicBool::new(true); + let started = Instant::now(); + let result = + wait_until_cleared(&flag, Duration::from_millis(150), Duration::from_millis(20)); + assert!(!result, "timeout must surface as false"); + // Allow generous CI scheduling slop: must wait at least the + // timeout, but not wildly longer. + let elapsed = started.elapsed(); + assert!( + elapsed >= Duration::from_millis(140), + "must wait at least the timeout, waited {elapsed:?}" + ); + assert!( + elapsed < Duration::from_secs(2), + "must not vastly exceed the timeout, waited {elapsed:?}" + ); + } + + #[test] + fn wait_until_cleared_returns_true_when_flag_clears_mid_wait() { + let flag = Arc::new(AtomicBool::new(true)); + let writer_flag = Arc::clone(&flag); + // Clear the flag from a background thread after ~50ms; the + // wait should observe the change and return true well before + // the 5s timeout. + std::thread::spawn(move || { + std::thread::sleep(Duration::from_millis(50)); + writer_flag.store(false, Ordering::SeqCst); + }); + let started = Instant::now(); + let result = wait_until_cleared(&flag, Duration::from_secs(5), Duration::from_millis(10)); + assert!(result, "flag clearing mid-wait must surface as true"); + assert!( + started.elapsed() < Duration::from_secs(2), + "must observe the clear well before the timeout" + ); + } +} diff --git a/src/core/plm/src/lib.rs b/src/core/plm/src/lib.rs new file mode 100644 index 00000000..232085b2 --- /dev/null +++ b/src/core/plm/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Library surface for the permissive learning mode (PLM) crate. +//! Pure-data modules compile cross-platform; Windows-only items are +//! gated per-module. The `plm` binary in `main.rs` is Windows-only. 
+ +pub mod coordination; +pub mod profile_gen; + +#[cfg(target_os = "windows")] +pub mod log; + +#[cfg(target_os = "windows")] +pub mod start; + +#[cfg(target_os = "windows")] +pub mod stop; + +#[cfg(target_os = "windows")] +pub mod wpr_path; diff --git a/src/core/plm/src/log.rs b/src/core/plm/src/log.rs new file mode 100644 index 00000000..d8ab38d1 --- /dev/null +++ b/src/core/plm/src/log.rs @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Interactive "logging" mode. +//! +//! 1. Prompts the user to press Enter to start logging. +//! 2. Starts a WPR trace (same `AccessFailureProfile` used by `start`). +//! 3. Prompts the user to press Enter to stop logging. +//! 4. Stops the trace into a temp .etl and reports where it landed. + +use anyhow::{Context, Result}; +use chrono::Local; +use std::io::{self, BufRead, Write}; +use std::path::{Path, PathBuf}; + +use crate::coordination::PLM_LOG_START_IN_FLIGHT; +use crate::start; +use crate::wpr_path::wpr_command; +use std::sync::atomic::Ordering; + +fn prompt_enter(message: &str) -> Result<()> { + print!("{message}"); + io::stdout().flush().ok(); + let stdin = io::stdin(); + let mut line = String::new(); + stdin + .lock() + .read_line(&mut line) + .context("failed to read from stdin")?; + Ok(()) +} + +fn stop_wpr_trace(trace_file: &Path) -> Result<()> { + // Capture stdio rather than inheriting so `wpr -stop`'s progress + // bar (`100% [>>>>>>>>>]`) and other chatter don't leak into any + // wrapping tool's stdout. On non-zero exit we replay the captured + // streams via the shared `replay_wpr_output` helper so operators + // can still see wpr's own diagnostic. 
+ let output = wpr_command() + .args(["-stop", &trace_file.to_string_lossy()]) + .output() + .context("failed to spawn wpr -stop")?; + if !output.status.success() { + crate::start::replay_wpr_output("stop", &output); + anyhow::bail!("wpr -stop exited with {}", output.status); + } + Ok(()) +} + +pub fn run( + wprp_path: &Path, + verbose: bool, + on_trace_started: impl FnOnce(), + on_trace_stopped: impl FnOnce(), +) -> Result<()> { + prompt_enter("Press Enter to start logging...")?; + // Bracket the `wpr -start` spawn so the console-control handler + // in `plm/src/main.rs` waits for it to drain before deciding + // whether to issue `wpr -cancel`. Closes the same race the + // wxc-exec side closes with `AUDIT_START_IN_FLIGHT`. + PLM_LOG_START_IN_FLIGHT.store(true, Ordering::SeqCst); + let start_result = start::start_plm_trace(wprp_path); + PLM_LOG_START_IN_FLIGHT.store(false, Ordering::SeqCst); + start_result?; + // `wpr -start` has engaged the kernel session. Only NOW mark the + // trace active so a stdin-EOF / spawn-fail before this point can't + // trip the Ctrl+C handler into `wpr -cancel`ing an unrelated host + // WPR session. + on_trace_started(); + println!("Logging started."); + + prompt_enter("Press Enter to stop logging...")?; + + // Per-run trace file in temp; PID + sub-second component prevents + // parallel `plm log` invocations from colliding on the same .etl. + let stamp = Local::now().format("%Y-%m-%d_%H%M%S%.3f").to_string(); + let trace_file: PathBuf = std::env::temp_dir().join(format!("plm_log_{stamp}.etl")); + stop_wpr_trace(&trace_file)?; + // Kernel session is torn down; safe to clear the active flag so + // any subsequent Ctrl+C doesn't issue a stale `wpr -cancel`. 
+ on_trace_stopped(); + + println!("Trace captured at {}.", trace_file.display()); + if verbose { + println!("verbose logging requested."); + } + Ok(()) +} diff --git a/src/core/plm/src/main.rs b/src/core/plm/src/main.rs new file mode 100644 index 00000000..a747c8b1 --- /dev/null +++ b/src/core/plm/src/main.rs @@ -0,0 +1,385 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Rust port of the permissive learning mode (PLM) PowerShell scripts. +//! +//! Subcommands: +//! - `start`: cancel any active WPR trace and start a new one using +//! `plm.wprp!AccessFailureProfile`. +//! - `stop`: stop the trace and write `trace.etl` into a log directory. +//! - `log`: interactive — Enter to start, Enter to stop. +//! +//! The functional binary wraps WPR / ETW / EventLog APIs that have no +//! cross-platform equivalent and is therefore Windows-only. On +//! Linux/macOS we still compile a stub binary so the crate sits inside +//! the workspace `default-members` list (one members list to maintain, +//! cross-platform CI catches drift); invoking it prints a message and +//! exits non-zero. + +#[cfg(not(target_os = "windows"))] +fn main() { + eprintln!("plm is Windows-only; this stub binary does nothing on non-Windows targets."); + std::process::exit(1); +} + +#[cfg(target_os = "windows")] +use anyhow::{Context, Result}; +#[cfg(target_os = "windows")] +use clap::{Parser, Subcommand}; +#[cfg(target_os = "windows")] +use std::path::PathBuf; +#[cfg(target_os = "windows")] +use std::sync::atomic::{AtomicBool, AtomicIsize, Ordering}; +#[cfg(target_os = "windows")] +use std::time::Duration; + +#[cfg(target_os = "windows")] +use plm::coordination::{singleton_bypass_requested, wait_until_cleared, PLM_LOG_START_IN_FLIGHT}; +#[cfg(target_os = "windows")] +use plm::{log, profile_gen, start, stop}; + +/// Raw `HANDLE` value of the named-mutex singleton acquired by +/// `acquire_singleton_if_needed` (zero when unheld). 
Stashed in a +/// static so the console-control handler can release the host-wide +/// `Global\Mxc_Plm_Audit` guard before `ExitProcess` runs and skips +/// Rust destructors, preventing the retry-on-conflict path in +/// `start_plm_trace` from `wpr -cancel`ing a peer PLM trace. +#[cfg(target_os = "windows")] +static PLM_SINGLETON_HANDLE: AtomicIsize = AtomicIsize::new(0); + +/// Backing storage for `AcquiredSingleton::mark_trace_active` / +/// `clear_trace_active` / `cancel_active_trace`. +/// +/// Kept as a process-wide `static` (not an owned field of +/// `AcquiredSingleton`) for one narrow reason: the Windows console- +/// control handler `plm_ctrl_handler` is an OS-owned `extern "system"` +/// callback with no `self` / captured environment. It can only reach +/// state via process globals. Access from the `main` thread, however, +/// is gated behind `&AcquiredSingleton` methods so it is a +/// compile-time invariant that the trace-active flag can only be +/// mutated while we hold the host-wide singleton mutex — you can't +/// call `mark_trace_active()` in a free function without first +/// producing an `AcquiredSingleton`. +#[cfg(target_os = "windows")] +static PLM_TRACE_ACTIVE: AtomicBool = AtomicBool::new(false); + +/// Release the named-mutex singleton if held. Idempotent. +#[cfg(target_os = "windows")] +fn release_plm_singleton() { + plm::coordination::singleton::release(&PLM_SINGLETON_HANDLE); +} + +/// Cancel any active PLM trace from a context that can't produce an +/// `&AcquiredSingleton` — currently just the ctrl handler, which +/// runs in an OS-owned callback with no captured environment. All +/// non-signal-context callers should use +/// `AcquiredSingleton::cancel_active_trace(&self)` instead so the +/// call site proves the singleton is held. +#[cfg(target_os = "windows")] +fn cancel_active_plm_trace_from_signal() { + if PLM_TRACE_ACTIVE.swap(false, Ordering::SeqCst) { + // Use the kernel-published System32 path. 
+ let _ = plm::wpr_path::wpr_command() + .arg("-cancel") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status(); + } +} + +/// RAII wrapper for the host-wide `Global\Mxc_Plm_Audit` singleton. +/// Ownership of the singleton is the precondition for touching the +/// trace-active flag — the methods below take `&self` so a live +/// `AcquiredSingleton` must exist at every call site. +#[cfg(target_os = "windows")] +struct AcquiredSingleton; + +#[cfg(target_os = "windows")] +impl AcquiredSingleton { + /// Mark the kernel ETW session as live; called immediately after + /// `start::start_plm_trace` succeeds. + fn mark_trace_active(&self) { + PLM_TRACE_ACTIVE.store(true, Ordering::SeqCst); + } + + /// Clear the trace-active flag; called after `wpr -stop` drains + /// the kernel session so a subsequent Ctrl+C doesn't issue a + /// stale `wpr -cancel`. + fn clear_trace_active(&self) { + PLM_TRACE_ACTIVE.store(false, Ordering::SeqCst); + } + + /// Issue `wpr -cancel` iff a trace was marked active by this + /// process. Idempotent. Non-signal-context callers use this + /// method; the ctrl handler uses `cancel_active_plm_trace_from_signal`. + fn cancel_active_trace(&self) { + cancel_active_plm_trace_from_signal(); + } +} + +#[cfg(target_os = "windows")] +impl Drop for AcquiredSingleton { + fn drop(&mut self) { + // Cancel any leftover trace before releasing the singleton so + // a caller that returns an error mid-flow can't leak the + // kernel session past our exit. + self.cancel_active_trace(); + release_plm_singleton(); + } +} + +#[cfg(target_os = "windows")] +fn acquire_singleton_if_needed() -> Result> { + if singleton_bypass_requested() { + // Outer process holds the mutex for the whole audit window; + // re-acquiring here would deadlock. 
+ return Ok(None); + } + use plm::coordination::singleton::{try_acquire, AcquireError}; + match try_acquire(&PLM_SINGLETON_HANDLE) { + Ok(()) => Ok(Some(AcquiredSingleton)), + Err(AcquireError::AlreadyHeld) => anyhow::bail!( + "another PLM trace is already in progress (Global\\Mxc_Plm_Audit held); \ + refusing to start a second concurrent trace — only one NT Kernel Logger \ + session can exist per host" + ), + Err(AcquireError::CreateFailed(e)) => { + Err(e).context("CreateMutexW failed for Global\\Mxc_Plm_Audit") + } + } +} + +/// Windows console-control handler. Fires on Ctrl+C, Ctrl+Break, +/// console close, logoff, and shutdown. Tears down any in-flight WPR +/// session and releases the singleton mutex before the default handler +/// calls `ExitProcess` (which skips Rust destructors). +/// +/// We poll `PLM_LOG_START_IN_FLIGHT` via `wait_until_cleared` instead +/// of a proper wait-object (Event / condvar) for two reasons: +/// 1. `wpr -start`'s underlying kernel session engagement isn't +/// signalled by any OS-published handle we can wait on; the only +/// transition we can observe is the child `wpr.exe` process +/// returning. Wrapping a Rust `Event` around that in the ctrl +/// handler would still require polling / a spawn-time helper +/// thread purely to `SetEvent`. +/// 2. The polling interval (50ms) is bounded above by +/// `CTRL_HANDLER_DRAIN_TIMEOUT` (2s) which is well under +/// Windows's ~5s ctrl-handler kill budget, so at most ~40 polls +/// fire — negligible CPU, zero cost on the happy path (the flag +/// is normally already clear when the handler runs). +#[cfg(target_os = "windows")] +unsafe extern "system" fn plm_ctrl_handler(_ctrl_type: u32) -> windows::core::BOOL { + // if `plm log`'s `wpr -start` is + // still in flight when Ctrl+C arrives, briefly wait for it to + // settle before deciding whether to issue `wpr -cancel`. 
Without + // this wait, a cancel that races a not-yet-engaged session is a + // no-op and the kernel session leaks past `plm.exe` exit. + // + // timeout sourced from the + // shared `plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT` so + // `plm.exe` and `wxc-exec`'s `dacl_ctrl_handler` cannot drift + // apart. The const docs explain the ~5s OS kill budget rationale. + // Polls via the shared `wait_until_cleared` helper so the same + // loop is tested in one place — see `plm::coordination::tests`. + let _ = wait_until_cleared( + &PLM_LOG_START_IN_FLIGHT, + plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT, + Duration::from_millis(50), + ); + cancel_active_plm_trace_from_signal(); + release_plm_singleton(); + // Return FALSE so the default handler still runs and terminates + // the process. Matches `wxc-exec`'s dacl_ctrl_handler pattern. + windows::core::BOOL(0) +} + +#[cfg(target_os = "windows")] +fn install_ctrl_handler() { + use windows::Win32::System::Console::SetConsoleCtrlHandler; + // SAFETY: handler has the correct ABI; Add=TRUE merely appends to + // the OS handler chain. + let _ = unsafe { SetConsoleCtrlHandler(Some(plm_ctrl_handler), true) }; +} + +#[derive(Parser, Debug)] +#[command( + name = "plm", + about = "Rust port of the permissive learning mode PowerShell scripts.", + version +)] +#[cfg(target_os = "windows")] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Subcommand, Debug)] +#[cfg(target_os = "windows")] +enum Cmd { + /// Start a new WPR trace using plm.wprp!AccessFailureProfile. + Start { + /// Override path to plm.wprp. Defaults to \plm.wprp. + #[arg(long)] + wprp: Option, + }, + /// Stop the trace and write `trace.etl` into a log directory. + Stop { + /// Directory for trace.etl, copied input config, and Adjusted_*.json. + #[arg(long)] + log_dir: Option, + /// Path treated as the application binary's location. Defaults + /// to the directory containing the plm executable. 
+ #[arg(long)] + bin_path: Option, + /// Path to the MXC container config (JSON) to update. + #[arg(long)] + config_path: Option, + /// Override for the adjusted config output path. + #[arg(long)] + adjusted_config_path: Option, + /// Re-process a previously captured .etl instead of stopping a + /// live WPR session. When set, `wpr -stop` is skipped and the + /// supplied file is parsed as-is. + #[arg(long)] + trace_file: Option, + /// Emit per-event/per-ACE diagnostic output. + #[arg(long)] + verbose_logging: bool, + }, + /// Interactive: press Enter to start logging, press Enter again to stop. + Log { + /// Override path to plm.wprp. Defaults to \plm.wprp. + #[arg(long)] + wprp: Option, + /// Emit per-event/per-ACE diagnostic output. + #[arg(long)] + verbose_logging: bool, + }, +} + +#[cfg(target_os = "windows")] +fn exe_dir() -> Result { + let exe = std::env::current_exe().context("failed to resolve current exe path")?; + Ok(exe + .parent() + .map(|p| p.to_path_buf()) + .unwrap_or_else(|| PathBuf::from("."))) +} + +#[cfg(target_os = "windows")] +fn main() -> Result<()> { + let cli = Cli::parse(); + let exe = exe_dir()?; + + // Confirm the resolved wpr.exe exists at `%SystemDirectory%` + // before we go further. We rely on `GetSystemDirectoryW` (not + // env-spoofable) plus the OS TrustedInstaller ACL on that + // directory as the trust boundary; see `wpr_path` module docs for + // why we do not run WinVerifyTrust on the resolved binary. + plm::wpr_path::verify_wpr_signed().map_err(|e| anyhow::anyhow!("wpr.exe check failed: {e}"))?; + + // Install the Ctrl+C handler unconditionally so signals during any + // subcommand (in particular interactive `log`) tear down the WPR + // session and release the singleton before ExitProcess fires. + install_ctrl_handler(); + + match cli.cmd { + Cmd::Start { wprp } => { + let _singleton = acquire_singleton_if_needed()?; + // Default: materialize the embedded `plm.wprp` next to the + // exe if one isn't already there. 
+ let wprp_path = match wprp { + Some(p) => p, + None => profile_gen::ensure_wprp_next_to_exe(&exe) + .context("failed to stage plm.wprp next to plm.exe")?, + }; + start::start_plm_trace(&wprp_path)?; + // `plm start` exits immediately and leaves the kernel ETW + // session running until a later `plm stop` / `wpr -stop`. + // We deliberately do NOT mark PLM_TRACE_ACTIVE here: this + // process is about to exit and can't be the one to cancel + // the session it just kicked off. The matching `plm stop` + // (or wxc-exec's `cancel_active_audit_trace` cleanup path + // on Ctrl+C) is what owns teardown. + Ok(()) + } + Cmd::Stop { + log_dir, + bin_path, + config_path, + adjusted_config_path, + trace_file, + verbose_logging, + } => { + let _singleton = acquire_singleton_if_needed()?; + stop::run( + stop::StopOptions { + log_dir, + bin_path, + config_path, + adjusted_config_path, + trace_file, + verbose: verbose_logging, + }, + &exe, + ) + } + Cmd::Log { + wprp, + verbose_logging, + } => { + let singleton = acquire_singleton_if_needed()?; + // see `Cmd::Start` above — stage the embedded profile if + // missing. + let wprp_path = match wprp { + Some(p) => p, + None => profile_gen::ensure_wprp_next_to_exe(&exe) + .context("failed to stage plm.wprp next to plm.exe")?, + }; + // The interactive `log` flow is the only standalone path + // that holds a live trace inside a single process. We hand + // `log::run` closures that call + // `AcquiredSingleton::mark_trace_active` / + // `clear_trace_active` on the borrowed singleton — the + // `&AcquiredSingleton` methods encode at compile time that + // trace-active can only be set while we hold the host-wide + // singleton mutex. `mark_trace_active` flips the flag only + // AFTER `wpr -start` has actually engaged the kernel + // session, so a stdin-EOF or spawn-fail before that point + // cannot trip the Ctrl+C handler into `wpr -cancel`ing an + // unrelated host WPR session. 
+ let result = if let Some(s) = singleton.as_ref() { + log::run( + &wprp_path, + verbose_logging, + || s.mark_trace_active(), + || s.clear_trace_active(), + ) + } else { + // Singleton bypass path (wxc-exec --audit already + // holds the mutex). No `AcquiredSingleton` exists in + // this process, so we can't gate the flag on it — + // fall back to the free-function path that the ctrl + // handler also uses. The outer process owns cleanup. + log::run( + &wprp_path, + verbose_logging, + || PLM_TRACE_ACTIVE.store(true, Ordering::SeqCst), + || PLM_TRACE_ACTIVE.store(false, Ordering::SeqCst), + ) + }; + // If `log::run` returned Err AND the trace had been marked + // active (start succeeded but stop or later step failed), + // the flag is still set — issue `wpr -cancel` so the NT + // Kernel Logger session doesn't leak until reboot. + if result.is_err() { + if let Some(s) = singleton.as_ref() { + s.cancel_active_trace(); + } else { + cancel_active_plm_trace_from_signal(); + } + } + result + } + } +} diff --git a/src/core/plm/src/profile_gen.rs b/src/core/plm/src/profile_gen.rs new file mode 100644 index 00000000..cccbe98a --- /dev/null +++ b/src/core/plm/src/profile_gen.rs @@ -0,0 +1,208 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Materialize `plm.wprp` next to the running `plm` binary on demand. +//! +//! The canonical profile lives inline below as `EMBEDDED_WPRP`. There +//! is no checked-in `plm.wprp` file and no build-time staging — the +//! binary writes the file itself on first use of `plm start` / +//! `plm log` when one isn't already next to the exe. + +use std::io::{self, Write}; +use std::path::{Path, PathBuf}; +use std::process; + +/// Canonical WPR profile. Edited here directly — there is no +/// sibling `plm.wprp` file to keep in sync. `start.rs`'s +/// `plm_wprp_resource_is_well_formed_…` test pins the parser +/// contract on these exact bytes. 
+pub const EMBEDDED_WPRP: &str = r#" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +"#; + +/// Default filename for the staged profile. Lowercase to match what +/// `main.rs` defaults to (case-sensitive trees fail opaquely otherwise). +pub const WPRP_FILENAME: &str = "plm.wprp"; + +/// Ensure `plm.wprp` exists in `exe_dir`. If a file is already +/// present there, leave it untouched (an operator may have hand- +/// edited it) and return its path. Otherwise atomically write +/// `EMBEDDED_WPRP` to that path and return it. +/// +/// Atomic write: stage into `plm.wprp.tmp.`, fsync, then +/// `rename` over `plm.wprp`. This prevents a partial write (disk +/// full, AV hold, Ctrl+C, OS-budget kill) from leaving an empty or +/// truncated `plm.wprp` that every later run would silently adopt +/// via the early-return existence check. The temp file is removed +/// on any error path before the rename. +pub fn ensure_wprp_next_to_exe(exe_dir: &Path) -> io::Result { + let dst = exe_dir.join(WPRP_FILENAME); + if dst.exists() { + return Ok(dst); + } + let tmp = exe_dir.join(format!("{}.tmp.{}", WPRP_FILENAME, process::id())); + match write_then_rename(&tmp, &dst) { + Ok(()) => Ok(dst), + Err(e) => { + // Best-effort cleanup; ignore secondary errors so the + // caller sees the original failure. + let _ = std::fs::remove_file(&tmp); + // Lost a race with a concurrent staging: adopt the + // winner's copy rather than fail. + if e.kind() == io::ErrorKind::AlreadyExists && dst.exists() { + return Ok(dst); + } + Err(e) + } + } +} + +fn write_then_rename(tmp: &Path, dst: &Path) -> io::Result<()> { + let mut f = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(tmp)?; + f.write_all(EMBEDDED_WPRP.as_bytes())?; + f.sync_all()?; + drop(f); + // Byte-compare the staged file against the compile-time source of + // truth. 
Detects filter drivers, disk-quota clamps, and AV rewrites + // that would otherwise let a corrupted `plm.wprp` slip through + // `write_all` + `sync_all` and be adopted on every subsequent run + // via the early-return existence check in `ensure_wprp_next_to_exe`. + let staged = std::fs::read(tmp)?; + if staged != EMBEDDED_WPRP.as_bytes() { + // Best-effort cleanup; propagate the integrity error, not the + // secondary remove failure. + let _ = std::fs::remove_file(tmp); + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "staged plm.wprp does not match embedded profile", + )); + } + // `std::fs::rename` replaces an existing `dst`, so losing a staging + // race normally still returns Ok. If the rename itself fails while + // `dst` exists, report AlreadyExists so the caller adopts that copy. + match std::fs::rename(tmp, dst) { + Ok(()) => Ok(()), + Err(e) if dst.exists() => Err(io::Error::new( + io::ErrorKind::AlreadyExists, + format!("plm.wprp already exists: {e}"), + )), + Err(e) => Err(e), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn embedded_wprp_declares_access_failure_profile() { + assert!(EMBEDDED_WPRP.contains(" = std::fs::read_dir(tmp.path()) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.file_name().to_string_lossy().starts_with("plm.wprp.tmp.")) + .collect(); + assert!( + leftovers.is_empty(), + "stale tmp files: {:?}", + leftovers.iter().map(|e| e.file_name()).collect::>() + ); + } +} diff --git a/src/core/plm/src/start.rs b/src/core/plm/src/start.rs new file mode 100644 index 00000000..0e329b24 --- /dev/null +++ b/src/core/plm/src/start.rs @@ -0,0 +1,277 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! `plm start` — start a WPR trace using the `AccessFailureProfile` +//! defined in `profile_gen::EMBEDDED_WPRP` (materialized to disk next +//! to `plm.exe` by `profile_gen`). If a pre-existing WPR session +//! blocks our start, we cancel it and retry exactly once. 
+ +use anyhow::Result; +use std::path::Path; +use std::process::{ExitStatus, Stdio}; + +use crate::wpr_path::wpr_command; + +/// Trait for testable `wpr.exe` start/cancel invocations. Tests +/// supply a fake that returns canned exit codes; production uses +/// `WprExe`. +pub trait WprLauncher { + /// Run `wpr -start` for `profile_arg`. `Err` means the process could + /// not be spawned; a non-zero exit is returned as `Ok(status)`. + fn start(&mut self, profile_arg: &str) -> Result<ExitStatus>; + /// Best-effort cancel of the currently active WPR session. + fn cancel(&mut self); +} + +pub struct WprExe; + +impl WprLauncher for WprExe { + fn start(&mut self, profile_arg: &str) -> Result<ExitStatus> { + // Surface the resolved wpr.exe path in the spawn-failure + // context so hosts missing the Windows Performance Toolkit + // (e.g. stripped Server SKUs) get an actionable hint instead + // of a bare `os error 2`. + // + // Capture wpr.exe's stdout/stderr rather than inheriting them + // (via `.status()`) so a successful `wpr -start` doesn't + // pollute the console of any wrapping tool (e.g. wxc-exec + // --audit); on non-zero exit we replay the captured streams + // so operators can still diagnose real failures. + // + // Build the command once (as `WprExeStopper::stop` does) so the + // path reported on failure is exactly the program we spawned. + let mut cmd = wpr_command(); + let resolved = cmd.get_program().to_string_lossy().into_owned(); + let output = cmd + .args(["-start", profile_arg, "-filemode"]) + .output() + .map_err(|e| describe_wpr_spawn_error("start", &resolved, e))?; + if !output.status.success() { + replay_wpr_output("start", &output); + } + Ok(output.status) + } + fn cancel(&mut self) { + cancel_existing_wpr_trace(); + } +} + +/// Wrap a `wpr.exe` spawn `io::Error` with the resolved absolute path +/// so failures are actionable (`wpr.exe not present at <path> — +/// install the Windows Performance Toolkit`). +fn describe_wpr_spawn_error(verb: &str, resolved: &str, e: std::io::Error) -> anyhow::Error { + if e.kind() == std::io::ErrorKind::NotFound { + anyhow::anyhow!( + "failed to spawn wpr -{verb}: {e} (resolved path: {resolved}). 
\ + The Windows Performance Recorder (wpr.exe) is required for PLM \ + tracing; install the Windows Performance Toolkit (part of the \ + Windows ADK) and ensure {resolved} is present.", + ) + } else { + anyhow::anyhow!("failed to spawn wpr -{verb} ({resolved}): {e}",) + } +} + +/// Replay captured wpr.exe stdout/stderr to the caller's own streams. +/// Used only on failure paths — the happy path stays silent so PLM +/// can be embedded in wrappers (e.g. `wxc-exec --audit`) without +/// polluting their console. +pub(crate) fn replay_wpr_output(verb: &str, output: &std::process::Output) { + use std::io::Write as _; + if !output.stdout.is_empty() { + let _ = std::io::stdout().write_all(&output.stdout); + } + if !output.stderr.is_empty() { + let _ = std::io::stderr().write_all(&output.stderr); + } + eprintln!("[plm] wpr -{verb} exited with {}", output.status); +} + +/// Cancel any pre-existing in-memory WPR session before starting a +/// new one. Returns non-zero when no session was active — we ignore +/// the exit code and silence output. +/// +/// Only one NT Kernel Logger session can exist host-wide, so this +/// necessarily terminates any concurrent recording (PLM's previous +/// run or an unrelated tool); we log a warning to stderr. +/// +/// We deliberately do NOT gate this on `wpr -status` — its English- +/// only stdout match breaks on every localized Windows install. +/// Cancel is invoked only on the retry path after `wpr -start` +/// itself reports a conflict (locale-invariant). +pub fn cancel_existing_wpr_trace() { + eprintln!( + "[plm] cancelling pre-existing WPR session via `wpr -cancel`; \ + any concurrent non-PLM WPR recording on this host has just been terminated. 
\ + (Only one NT Kernel Logger session can exist at a time.)" + ); + let _ = wpr_command() + .arg("-cancel") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status(); +} + +/// Core try-then-cancel-then-retry state machine, parameterised on a +/// `WprLauncher` so tests can drive the conflict + retry branches +/// deterministically. +pub fn start_plm_trace_with(launcher: &mut L, wprp_path: &Path) -> Result<()> { + let arg = format!("{}!AccessFailureProfile", wprp_path.display()); + let first = launcher.start(&arg)?; + if first.success() { + return Ok(()); + } + launcher.cancel(); + let second = launcher.start(&arg)?; + if !second.success() { + anyhow::bail!( + "wpr -start exited with {second} (also failed after retry following wpr -cancel)" + ); + } + Ok(()) +} + +pub fn start_plm_trace(wprp_path: &Path) -> Result<()> { + start_plm_trace_with(&mut WprExe, wprp_path) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::windows::process::ExitStatusExt; + use std::path::PathBuf; + + struct FakeLauncher { + starts: Vec, + idx: usize, + cancels: usize, + } + impl FakeLauncher { + fn new(codes: &[u32]) -> Self { + Self { + starts: codes.iter().map(|c| ExitStatus::from_raw(*c)).collect(), + idx: 0, + cancels: 0, + } + } + } + impl WprLauncher for FakeLauncher { + fn start(&mut self, _arg: &str) -> Result { + let s = self.starts[self.idx]; + self.idx += 1; + Ok(s) + } + fn cancel(&mut self) { + self.cancels += 1; + } + } + + #[test] + fn start_plm_trace_first_attempt_success_does_not_cancel() { + let mut f = FakeLauncher::new(&[0]); + start_plm_trace_with(&mut f, &PathBuf::from("plm.wprp")).unwrap(); + assert_eq!(f.idx, 1); + assert_eq!(f.cancels, 0); + } + + #[test] + fn start_plm_trace_retries_once_after_conflict() { + // First attempt fails (non-zero), cancel runs, second succeeds. 
+ let mut f = FakeLauncher::new(&[1, 0]); + start_plm_trace_with(&mut f, &PathBuf::from("plm.wprp")).unwrap(); + assert_eq!(f.idx, 2); + assert_eq!(f.cancels, 1); + } + + #[test] + fn start_plm_trace_propagates_when_retry_also_fails() { + let mut f = FakeLauncher::new(&[1, 1]); + let err = start_plm_trace_with(&mut f, &PathBuf::from("plm.wprp")).unwrap_err(); + assert!(format!("{err}").contains("failed after retry")); + assert_eq!(f.idx, 2); + assert_eq!(f.cancels, 1); + } + + /// when wpr.exe isn't on the system + /// (e.g. Server SKU without WPT), the spawn-failure context must + /// surface the resolved path AND nudge the operator toward + /// installing the Windows Performance Toolkit. Asserting against + /// a real spawn isn't deterministic on CI, so drive the formatter + /// directly with a synthesized NotFound `io::Error`. + #[test] + fn wpr_spawn_not_found_error_is_actionable() { + let err = describe_wpr_spawn_error( + "start", + "C:\\Windows\\System32\\wpr.exe", + std::io::Error::new(std::io::ErrorKind::NotFound, "the system cannot find"), + ); + let s = format!("{err}"); + assert!( + s.contains("C:\\Windows\\System32\\wpr.exe"), + "error must surface resolved wpr path: {s}", + ); + assert!( + s.contains("Windows Performance Toolkit") || s.contains("Windows ADK"), + "error must hint at WPT install: {s}", + ); + } + + #[test] + fn wpr_spawn_other_error_keeps_path_in_context() { + let err = describe_wpr_spawn_error( + "stop", + "C:\\Windows\\System32\\wpr.exe", + std::io::Error::new(std::io::ErrorKind::PermissionDenied, "access denied"), + ); + let s = format!("{err}"); + assert!(s.contains("C:\\Windows\\System32\\wpr.exe"), "got: {s}"); + assert!(s.contains("stop"), "verb must appear: {s}"); + } + + /// Pin that the embedded WPR profile (`profile_gen::EMBEDDED_WPRP`) + /// is well-formed XML and still declares the `AccessFailureProfile` + /// recording referenced by `start_plm_trace_with`. 
The profile is + /// no longer a separate file — it lives as a raw string in + /// `profile_gen.rs` — so this test is the only schema gate. + #[test] + fn plm_wprp_resource_is_well_formed_and_declares_access_failure_profile() { + let wprp = crate::profile_gen::EMBEDDED_WPRP; + let doc = + roxmltree::Document::parse(wprp).expect("EMBEDDED_WPRP must parse as well-formed XML"); + + // The recording name must stay `AccessFailureProfile` — + // `start_plm_trace_with` builds `!AccessFailureProfile`. + let has_profile = doc + .descendants() + .filter(|n| n.has_tag_name("Profile")) + .any(|n| n.attribute("Name") == Some("AccessFailureProfile")); + assert!( + has_profile, + "EMBEDDED_WPRP must declare a \ + element — the runtime hard-codes this name in start_plm_trace", + ); + + // The harness depends on the Privacy-Auditing-PLM event + // provider for its event-id=14 / event-id=27 detection paths. + // Sanity-check that the profile still references it; dropping + // it (by name OR GUID) silently disables every meaningful + // detection. + let mentions_plm_provider = wprp + .contains("Microsoft-Windows-Privacy-Auditing-PermissiveLearningMode") + || wprp.contains("811a1ddb-2e69-5f25-adc0-4b186170e760"); + assert!( + mentions_plm_provider, + "EMBEDDED_WPRP must enable the Microsoft-Windows-Privacy-Auditing-PermissiveLearningMode \ + provider (GUID 811a1ddb-2e69-5f25-adc0-4b186170e760); without it the \ + event-id=14/27 detection pipeline has nothing to consume", + ); + + // The profile also wires the kernel collector for process/loader + // events the parser uses to attribute access failures to a + // specific application binary. Verify the collector reference + // still exists. 
+ let has_kernel_collector = doc + .descendants() + .filter(|n| n.has_tag_name("SystemCollector")) + .any(|n| n.attribute("Id") == Some("SC_Kernel")); + assert!( + has_kernel_collector, + "EMBEDDED_WPRP must declare the SC_Kernel SystemCollector that the \ + AccessFailureProfile recording references", + ); + } +} diff --git a/src/core/plm/src/stop.rs b/src/core/plm/src/stop.rs new file mode 100644 index 00000000..0f01d02c --- /dev/null +++ b/src/core/plm/src/stop.rs @@ -0,0 +1,250 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! `plm stop` — stop the in-progress WPR trace and write `trace.etl` +//! into a log directory. + +use anyhow::{Context, Result}; +use chrono::Local; +use std::path::{Path, PathBuf}; +use std::process::ExitStatus; + +use crate::wpr_path::wpr_command; + +pub struct StopOptions { + pub log_dir: Option, + pub bin_path: Option, + pub config_path: Option, + pub adjusted_config_path: Option, + /// When set, skip `wpr -stop` and treat the supplied .etl as the + /// captured trace. Useful for re-processing a previously captured + /// trace without an active WPR session. + pub trace_file: Option, + pub verbose: bool, +} + +/// Abstraction over `wpr -stop` invocations so the failure-mapping +/// state machine in `stop_plm_trace_with` is testable without +/// actually spawning processes. Mirrors `start::WprLauncher`. +pub trait WprStopper { + fn stop(&mut self, trace_file: &Path) -> Result; +} + +pub struct WprExeStopper; + +impl WprStopper for WprExeStopper { + fn stop(&mut self, trace_file: &Path) -> Result { + // Capture stdio rather than inheriting so a successful `wpr + // -stop` doesn't leak wpr chatter into any wrapping tool (e.g. + // `wxc-exec --audit`). On non-zero exit we replay the captured + // streams so operators can still see wpr's own diagnostic. 
+ let mut cmd = wpr_command(); + let resolved = cmd.get_program().to_string_lossy().into_owned(); + let output = cmd + .args(["-stop", &trace_file.to_string_lossy()]) + .output() + .map_err(|e| anyhow::anyhow!("failed to spawn wpr -stop ({resolved}): {e}"))?; + if !output.status.success() { + crate::start::replay_wpr_output("stop", &output); + } + Ok(output.status) + } +} + +/// Testable wrapper for `wpr -stop` status handling. +pub fn stop_plm_trace_with(stopper: &mut S, trace_file: &Path) -> Result<()> { + let status = stopper.stop(trace_file)?; + if !status.success() { + anyhow::bail!("wpr -stop exited with {status}"); + } + Ok(()) +} + +fn stop_plm_trace(trace_file: &Path) -> Result<()> { + stop_plm_trace_with(&mut WprExeStopper, trace_file) +} + +/// Resolve `--bin-path` (or fall back to the calling exe directory) +/// to its canonical form. Exposed even though the self-access filter +/// consumer isn't wired here, so the canonicalize fallback chain is +/// pinned by tests. +/// +/// Fallback chain: +/// 1. `canonicalize(opt.bin_path)` if `Some` +/// 2. raw `opt.bin_path` if `Some` (with a warning) +/// 3. `exe_dir` (no warning) +pub fn resolve_bin_path(opt: Option<&Path>, exe_dir: &Path) -> (PathBuf, Option) { + let Some(raw) = opt else { + return (exe_dir.to_path_buf(), None); + }; + match raw.canonicalize() { + Ok(p) => (p, None), + Err(e) => { + let warning = format!( + "could not canonicalize --bin-path {} ({}); self-access filter \ + will use the raw path. Events referencing the binary via a \ + different spelling (e.g. verbatim \\\\?\\) may leak into the \ + adjusted config.", + raw.display(), + e + ); + // Prefer the raw operator-supplied path over silently + // substituting exe_dir; that would drop operator intent. + (raw.to_path_buf(), Some(warning)) + } + } +} + +pub fn run(opts: StopOptions, exe_dir: &Path) -> Result<()> { + // $LogDir defaults to "\logs\". 
The sub-second + // component makes parallel PLM runs finishing in the same second + // land in distinct directories. + let log_dir = opts.log_dir.unwrap_or_else(|| { + let stamp = Local::now().format("%Y-%m-%d_%H%M%S%.3f").to_string(); + exe_dir.join("logs").join(stamp) + }); + std::fs::create_dir_all(&log_dir) + .with_context(|| format!("failed to create log dir {}", log_dir.display()))?; + + // Resolve bin_path so the operator-facing warning path is + // exercised and the canonical form is on disk for downstream + // consumers, even though the self-access filter isn't wired here. + let (_bin_path, warning) = resolve_bin_path(opts.bin_path.as_deref(), exe_dir); + if let Some(w) = warning { + eprintln!("[plm] warning: {w}"); + } + + let trace_file = if let Some(p) = opts.trace_file.as_ref() { + // Operator supplied a pre-captured .etl -- don't try to stop a + // (likely non-existent) live WPR session. + if !p.exists() { + anyhow::bail!("trace file does not exist: {}", p.display()); + } + p.clone() + } else { + let p = log_dir.join("trace.etl"); + stop_plm_trace(&p)?; + p + }; + + println!("Trace captured at {}.", trace_file.display()); + + // `config_path` / `adjusted_config_path` are accepted today so the + // wxc-exec --audit harness can pass them through without breaking + // for downstream consumers. 
+ if let Some(p) = opts.config_path.as_ref() { + let _ = p; + } + if let Some(p) = opts.adjusted_config_path.as_ref() { + let _ = p; + } + if opts.verbose { + println!("verbose logging is a no-op in this build."); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- resolve_bin_path ----------------------------------------------- + + #[test] + fn resolve_bin_path_falls_back_to_exe_dir_when_no_override() { + let exe = std::env::temp_dir(); + let (p, warn) = resolve_bin_path(None, &exe); + assert_eq!(p, exe); + assert!(warn.is_none(), "no operator intent means no warning"); + } + + #[test] + fn resolve_bin_path_canonicalizes_existing_override() { + let exe = std::env::temp_dir(); + let override_path = std::env::temp_dir(); + let (p, warn) = resolve_bin_path(Some(&override_path), &exe); + assert!(p.exists(), "canonicalized path should still exist"); + assert!(warn.is_none(), "successful canonicalize must not warn"); + } + + #[test] + fn resolve_bin_path_warns_and_returns_raw_when_canonicalize_fails() { + let exe = std::env::temp_dir(); + let bogus = std::path::PathBuf::from("Z:\\definitely-does-not-exist-plm-test"); + let (p, warn) = resolve_bin_path(Some(&bogus), &exe); + assert_eq!( + p, bogus, + "must return the raw operator path rather than silently \ + substituting exe_dir (would drop operator intent)" + ); + let w = warn.expect("canonicalize failure must surface a warning"); + assert!( + w.contains("Z:\\definitely-does-not-exist-plm-test"), + "warning must reference the failing path: {w}", + ); + } + + // ---- WprStopper / stop_plm_trace_with ------------------------------- + + use std::os::windows::process::ExitStatusExt; + + struct FakeStopper { + result: std::cell::Cell>>, + calls: std::cell::Cell, + } + impl FakeStopper { + fn ok(code: u32) -> Self { + Self { + result: std::cell::Cell::new(Some(Ok(ExitStatus::from_raw(code)))), + calls: std::cell::Cell::new(0), + } + } + fn err(msg: &'static str) -> Self { + Self { + result: 
std::cell::Cell::new(Some(Err(anyhow::anyhow!(msg)))), + calls: std::cell::Cell::new(0), + } + } + } + impl WprStopper for FakeStopper { + fn stop(&mut self, _trace_file: &Path) -> Result { + self.calls.set(self.calls.get() + 1); + self.result + .replace(None) + .expect("FakeStopper.stop called more than once") + } + } + + #[test] + fn stop_plm_trace_returns_ok_on_zero_exit() { + let mut s = FakeStopper::ok(0); + stop_plm_trace_with(&mut s, Path::new("trace.etl")) + .expect("zero-exit must propagate as Ok"); + assert_eq!(s.calls.get(), 1); + } + + #[test] + fn stop_plm_trace_propagates_nonzero_exit_with_context() { + let mut s = FakeStopper::ok(1); + let err = stop_plm_trace_with(&mut s, Path::new("trace.etl")) + .expect_err("non-zero exit must propagate as Err"); + let msg = format!("{err}"); + assert!( + msg.contains("wpr -stop exited"), + "error must name the failed command: {msg}", + ); + } + + #[test] + fn stop_plm_trace_propagates_spawn_error_verbatim() { + let mut s = FakeStopper::err("simulated spawn failure: not found"); + let err = stop_plm_trace_with(&mut s, Path::new("trace.etl")) + .expect_err("spawn error must propagate"); + let msg = format!("{err}"); + assert!( + msg.contains("simulated spawn failure"), + "error must surface the underlying io::Error context: {msg}", + ); + } +} diff --git a/src/core/plm/src/wpr_path.rs b/src/core/plm/src/wpr_path.rs new file mode 100644 index 00000000..11e652a0 --- /dev/null +++ b/src/core/plm/src/wpr_path.rs @@ -0,0 +1,179 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Locate `wpr.exe` by absolute path. +//! +//! `Command::new("wpr")` is unsafe: on Windows it resolves via +//! `CreateProcessW`'s implicit DLL/EXE search order — and that order +//! starts with the **current working directory**. Because PLM runs as +//! administrator (required to start the NT Kernel Logger), an +//! unprivileged user who can drop a `wpr.exe` into a directory an +//! 
admin later runs PLM from would gain code execution as that admin. +//! +//! Reading `%SystemRoot%` from the process environment block is also +//! unsafe: UAC inherits the unelevated parent's env verbatim. A +//! standard user can `setx +//! SystemRoot=C:\\Users\\Public\\evil`, plant +//! `evil\\System32\\wpr.exe`, and the next admin-elevated +//! `wxc-exec --audit` (or any cleanup-path `wpr -cancel`) launches the +//! attacker binary as administrator — strictly worse than the +//! original CWD plant because env travels with elevation. +//! +//! This module resolves the System directory via `GetSystemDirectoryW` +//! (kernel-published, not env-spoofable) once at first call and caches +//! the result. All PLM call sites must go through `wpr_command()` +//! instead of `Command::new("wpr")` directly. +//! +//! We do **not** call `WinVerifyTrust` on the resolved `wpr.exe`. +//! System binaries under `%SystemDirectory%` are typically +//! catalog-signed (`.cat` files in `CatRoot\`) rather than +//! embedded-signed, so `WinVerifyTrust` with the generic file policy +//! returns `TRUST_E_NOSIGNATURE` (0x800B0100) on stock Windows +//! installs. Correctly verifying a catalog-signed binary requires the +//! `CryptCATAdmin*` fallback dance, and even then the trust boundary +//! it would enforce is "the file under `System32\\wpr.exe` was placed +//! there by an entity Windows trusts". Because we resolve that path +//! via `GetSystemDirectoryW` (not an attacker-controllable env var), +//! and any write to `%SystemDirectory%` requires `TrustedInstaller` +//! (or SYSTEM) — a strictly higher privilege than the admin +//! elevation PLM already runs at — the path resolution itself is our +//! security boundary. We keep `verify_wpr_signed` as a thin sanity +//! check that the binary actually exists at the resolved path. + +use std::path::PathBuf; +use std::process::Command; +use std::sync::OnceLock; + +/// Cached absolute path to `wpr.exe`, resolved on first use. 
+static WPR_PATH: OnceLock<PathBuf> = OnceLock::new(); + +/// Resolve `%SystemDirectory%\wpr.exe` via `GetSystemDirectoryW`. The kernel +/// publishes this value at process creation and the env block cannot +/// override it, so this is safe even when the parent (unelevated) +/// process set `SystemRoot` to an attacker-controlled directory. +/// +/// Falls back to `C:\\Windows\\System32\\wpr.exe` only if the API call +/// itself fails (which on a real Windows install does not happen). +#[cfg(target_os = "windows")] +fn resolve_wpr_path() -> PathBuf { + use windows::Win32::System::SystemInformation::GetSystemDirectoryW; + let mut buf = vec![0u16; 260]; + // SAFETY: buf is initialized; we pass a valid length and own the + // memory for the duration of the call. + let n = unsafe { GetSystemDirectoryW(Some(&mut buf)) }; + if n == 0 || (n as usize) > buf.len() { + // API failed or buffer somehow too small: use a hardcoded + // fallback rather than reading the env block. + return PathBuf::from("C:\\Windows\\System32\\wpr.exe"); + } + let dir = wxc_common::string_util::from_wide(&buf[..n as usize]); + let mut p = PathBuf::from(dir); + p.push("wpr.exe"); + p +} + +/// Sanity-check that the resolved `wpr.exe` actually exists on disk. +/// +/// The real security guarantee comes from `resolve_wpr_path` +/// (`GetSystemDirectoryW`, not env-spoofable) plus the OS +/// `TrustedInstaller`-only ACL on `%SystemDirectory%\\wpr.exe` — an +/// attacker who can plant a binary there has already escalated past +/// the admin token PLM runs under, so an in-process signature check +/// would be defence against a strictly higher privilege than the one +/// we hold. See the module doc for the full rationale. +/// +/// Returns `Err` if the resolved path doesn't exist on disk, which +/// indicates a broken/stripped Windows install (WPT not present) — +/// something the caller must surface with a clear message rather than +/// let `CreateProcess` fail cryptically later.
+#[cfg(target_os = "windows")] +pub fn verify_wpr_signed() -> Result<(), String> { + let path = WPR_PATH.get_or_init(resolve_wpr_path); + if !path.is_file() { + return Err(format!( + "wpr.exe not found at {} — install the Windows Performance Toolkit \ + (part of the Windows ADK) and retry", + path.display() + )); + } + Ok(()) +} + +/// Non-Windows stub — PLM is Windows-only, but the crate builds +/// cross-platform for CI parity, so this always succeeds. +#[cfg(not(target_os = "windows"))] +pub fn verify_wpr_signed() -> Result<(), String> { + Ok(()) +} + +/// Return a `Command` rooted at the absolute `wpr.exe` path. Callers +/// should still build their own `.args(...)` chain on top. +/// +/// On Windows we tack on `CREATE_NO_WINDOW` (0x08000000) so the child +/// wpr.exe process has no attached console. wpr renders its +/// `100% [>>>>>>]` progress bar via `WriteConsoleW`, which writes +/// **directly to the console handle** — that bypasses any stdio pipe +/// redirection (`.stdout(Stdio::piped())` / `.output()`), so without +/// this flag the progress bar leaks onto the wrapping tool's terminal +/// even though we capture stdout/stderr. Regular `printf`-style +/// stdout/stderr traffic still gets captured through the pipes and is +/// replayed on failure via `replay_wpr_output`. 
+pub fn wpr_command() -> Command { + let p = WPR_PATH.get_or_init(resolve_wpr_path); + let mut cmd = Command::new(p); + #[cfg(target_os = "windows")] + { + use std::os::windows::process::CommandExt; + const CREATE_NO_WINDOW: u32 = 0x0800_0000; + cmd.creation_flags(CREATE_NO_WINDOW); + } + cmd +} + +/// Non-Windows stand-in for `resolve_wpr_path`. `wpr_command()` carries +/// no `cfg` gate, so a resolver must exist on every target for the crate +/// to build; the bare name returned here is not expected to be spawned +/// because PLM is functionally Windows-only. +#[cfg(not(target_os = "windows"))] +fn resolve_wpr_path() -> PathBuf { + PathBuf::from("wpr.exe") +} + +// Windows-only: these tests assert a System32-style path obtained from +// `GetSystemDirectoryW`, which has no meaning on other targets. +#[cfg(all(test, target_os = "windows"))] +mod tests { + use super::*; + + #[test] + fn resolves_absolute_system_directory_wpr() { + let p = resolve_wpr_path(); + assert!( + p.is_absolute(), + "wpr path must be absolute: {}", + p.display() + ); + assert!( + p.ends_with("wpr.exe"), + "wpr path must end with wpr.exe: {}", + p.display() + ); + // The result must be under a `System32` (or `Sysnative` / + // `SysWOW64`) directory — never under user-writable paths. + let s = p.to_string_lossy().to_ascii_lowercase(); + assert!( + s.contains("\\system32\\") || s.contains("\\sysnative\\") || s.contains("\\syswow64\\"), + "wpr path must be under a system directory; got: {}", + p.display() + ); + } + + /// Setting `SystemRoot` in the + /// process env MUST NOT change which `wpr.exe` we resolve, because + /// the kernel-published system directory is the source of truth.
+ #[test] + fn ignores_system_root_env_var() { + let original = std::env::var_os("SystemRoot"); + std::env::set_var("SystemRoot", "C:\\Users\\Public\\evil"); + let p = resolve_wpr_path(); + let s = p.to_string_lossy().to_ascii_lowercase(); + assert!( + !s.contains("public") && !s.contains("evil"), + "resolve_wpr_path honored attacker-controlled SystemRoot: {}", + p.display() + ); + match original { + Some(v) => std::env::set_var("SystemRoot", v), + None => std::env::remove_var("SystemRoot"), + } + } +} diff --git a/src/core/wxc/Cargo.toml b/src/core/wxc/Cargo.toml index 13eee1ae..28ec2c19 100644 --- a/src/core/wxc/Cargo.toml +++ b/src/core/wxc/Cargo.toml @@ -26,6 +26,9 @@ nanvix_runner = { workspace = true, optional = true } nanvix_binaries = { path = "../../backends/nanvix/binaries", optional = true } wslc_common = { workspace = true, optional = true } isolation_session_bindings = { workspace = true, optional = true } +# Shared PLM dep for the singleton env-var name and `wait_until_cleared` +# helper — single source of truth so the two crates can't drift. +plm = { path = "../plm" } [build-dependencies] mxc_build_common.workspace = true diff --git a/src/core/wxc/src/audit.rs b/src/core/wxc/src/audit.rs new file mode 100644 index 00000000..24315df8 --- /dev/null +++ b/src/core/wxc/src/audit.rs @@ -0,0 +1,265 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Graceful-exit PLM audit-trace lifecycle for `wxc-exec --audit`. +//! +//! `--audit` runs `plm.exe start`, which leaves a live WPR ETW session +//! in the kernel for the duration of the workload. The matching +//! `plm.exe stop` tears it down. If anything between those two calls +//! aborts wxc-exec — Ctrl-C, panic, `process::exit`, container-runner +//! kill — the kernel session stays allocated until reboot or manual +//! `wpr -cancel`, blocking all other WPR consumers on the host (only +//! one NT Kernel Logger session can exist at a time). +//! +//! 
We bracket the live-trace window with `AUDIT_ACTIVE` plus a stack- +//! owned `AuditTraceGuard`. Cleanup paths: +//! * Normal exit and panic unwind — `AuditTraceGuard::drop` invokes +//! `cancel_active_audit_trace()`. +//! * Ctrl-C / Ctrl-Break / console close — the `dacl_ctrl_handler` +//! (in `main.rs`) also calls `cancel_active_audit_trace()` after +//! handling DACLs. +//! +//! `cancel_active_audit_trace()` is idempotent via the AtomicBool, so +//! it is safe for both paths to call it. +//! +//! The host-wide named-mutex singleton (`Global\Mxc_Plm_Audit`) is +//! shared with `plm.exe`; both binaries acquire and release it via +//! `plm::coordination::singleton` so their retry-on-conflict paths can +//! never silently `wpr -cancel` a peer trace. + +use std::sync::atomic::{AtomicBool, AtomicIsize, Ordering}; + +use wxc_common::logger::Logger; + +/// Path to `plm.exe`, expected to sit next to `wxc-exec.exe` in the +/// same install directory. Returns `None` when the current exe path +/// can't be resolved. +pub fn plm_exe_path() -> Option<std::path::PathBuf> { + std::env::current_exe() + .ok() + .and_then(|p| p.parent().map(|d| d.join("plm.exe"))) +} + +/// Run `plm.exe <args>` synchronously with its stdio captured; output is +/// replayed only on failure or in verbose mode. Audit tracing is a best-effort +/// diagnostic: missing-binary / spawn / non-zero-exit conditions are +/// logged and returned as `false` — this function never calls +/// `process::exit` on its own. The caller (currently the `--audit` +/// entry point) is responsible for deciding whether a `false` return +/// should abort the workload; today the `plm start` caller does abort +/// rather than run --audit without an active trace, while `plm stop` +/// merely falls through to the `wpr -cancel` cleanup path. +/// +/// Returns `true` iff the spawn succeeded **and** plm.exe exited with +/// a zero status.
The caller needs this signal to decide whether to +/// clear `AUDIT_ACTIVE` (only after a successful `plm stop`); without +/// it, `AUDIT_ACTIVE.store(false)` would run unconditionally and +/// silently leak the kernel ETW session on every failure path. +pub fn run_plm_command(args: &[&std::ffi::OsStr], logger: &mut Logger, verbose: bool) -> bool { + use std::fmt::Write as _; + + let Some(plm) = plm_exe_path() else { + let _ = writeln!(logger, "[audit] could not resolve plm.exe path"); + return false; + }; + if !plm.exists() { + let _ = writeln!( + logger, + "[audit] plm.exe not found at {} - skipping", + plm.display() + ); + return false; + } + + let mut summary = String::new(); + let _ = write!(summary, "[audit] running {}", plm.display()); + for a in args { + let _ = write!(summary, " {}", a.to_string_lossy()); + } + let _ = writeln!(logger, "{summary}"); + if verbose { + eprintln!("{summary}"); + } + + match std::process::Command::new(&plm) + .args(args) + // plm.exe itself acquires the `Global\Mxc_Plm_Audit` named- + // mutex singleton on direct operator invocations (`plm log` / + // `plm start` / `plm stop`) so its retry-on-conflict path + // can't silently `wpr -cancel` a peer trace. When wxc-exec + // spawns plm.exe we already hold that mutex for the whole + // audit window — tell the child to skip its own acquisition + // so we don't deadlock on the same global name. The env-var + // name comes from `plm::coordination::SINGLETON_HELD_BY_PARENT_ENV` + // rather than a duplicated string literal, so the plm and + // wxc crates can't drift out of sync without a compile error. + .env(plm::coordination::SINGLETON_HELD_BY_PARENT_ENV, "1") + // Capture plm.exe's stdout/stderr rather than inheriting the + // caller's console. Audit tracing is a background side-effect + // of `wxc-exec --audit`; leaking plm's chatter into the + // workload's stdio breaks any consumer that parses wxc-exec's + // stdout. 
On failure (non-zero exit or spawn error) we replay + // the captured streams so operators can still diagnose. In + // verbose mode we replay unconditionally. + .output() + { + Ok(o) if o.status.success() => { + if verbose { + replay_child_output(logger, &o); + } + true + } + Ok(o) => { + let _ = writeln!(logger, "[audit] plm exited with status {}", o.status); + replay_child_output(logger, &o); + if verbose { + eprintln!("[audit] plm exited with status {}", o.status); + } + false + } + Err(e) => { + let _ = writeln!(logger, "[audit] failed to spawn plm: {e}"); + if verbose { + eprintln!("[audit] failed to spawn plm: {e}"); + } + false + } + } +} + +/// Replay a captured child's stdout/stderr to the current process's +/// own streams. Used on failure (and in verbose mode on success) so +/// the happy path can stay silent while diagnostics still surface. +fn replay_child_output(logger: &mut Logger, output: &std::process::Output) { + use std::fmt::Write as _; + use std::io::Write as _; + if !output.stdout.is_empty() { + let _ = std::io::stdout().write_all(&output.stdout); + let _ = write!(logger, "{}", String::from_utf8_lossy(&output.stdout)); + } + if !output.stderr.is_empty() { + let _ = std::io::stderr().write_all(&output.stderr); + let _ = write!(logger, "{}", String::from_utf8_lossy(&output.stderr)); + } +} + +pub static AUDIT_ACTIVE: AtomicBool = AtomicBool::new(false); + +/// Set to `true` while `plm start` is being spawned and has not yet +/// returned. `AUDIT_ACTIVE` is flipped to `true` BEFORE `plm.exe` is +/// spawned (because `mark_audit_active()` has to run early to cover a +/// Ctrl+C arriving mid-spawn), but the kernel ETW session is not +/// actually engaged until `plm.exe`'s child `wpr -start` returns. A +/// Ctrl+C in that gap would fire `wpr -cancel` against a not-yet- +/// existing session, then `wpr -start` would silently succeed AFTER +/// the cancel — leaking the session past `wxc-exec`'s own cleanup. 
We +/// close the race by making the Ctrl+C handler wait (bounded) until +/// `plm start` has finished its spawn round-trip before deciding +/// whether to issue the cancel. +pub static AUDIT_START_IN_FLIGHT: AtomicBool = AtomicBool::new(false); + +/// Mark that the wxc-exec process owns a live PLM audit trace. Called +/// just before `plm start` is spawned so a Ctrl-C arriving mid-spawn +/// still triggers cleanup (over-cancelling a not-yet-started session +/// is harmless — `wpr -cancel` returns non-zero and we discard). +pub fn mark_audit_active() { + AUDIT_ACTIVE.store(true, Ordering::SeqCst); +} + +/// Cancel an in-flight PLM audit trace iff one is active, then clear +/// the flag. Idempotent; safe to call from the Ctrl-C handler and the +/// stack guard's Drop. Failures (no active session, missing wpr.exe) +/// are silenced because the call is best-effort cleanup. +/// +/// Invokes `wpr.exe` by absolute path (from `GetSystemDirectoryW`, not env) +/// rather than as a bare name so `CreateProcessW`'s implicit CWD-first +/// search order can't be abused to substitute a planted binary. +/// `wxc-exec --audit` is typically launched from an elevated +/// (administrator) context because starting a WPR kernel session +/// requires it, so a CWD-search hit here would give an attacker +/// elevated-equivalent code execution — hence the absolute System32 +/// path. +pub fn cancel_active_audit_trace() { + if AUDIT_ACTIVE.swap(false, Ordering::SeqCst) { + let wpr = resolve_system32_wpr(); + let _ = std::process::Command::new(&wpr) + .arg("-cancel") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status(); + } +} + +/// Resolve `%SystemDirectory%\wpr.exe` via `GetSystemDirectoryW`. Reading +/// `%SystemRoot%` from the process env is unsafe because UAC inherits +/// env from the unelevated parent — a standard user could `setx +/// SystemRoot=C:\\Users\\Public\\evil` and plant `wpr.exe` for a later +/// admin run.
`GetSystemDirectoryW` is kernel-published and not +/// env-spoofable. +fn resolve_system32_wpr() -> std::path::PathBuf { + use windows::Win32::System::SystemInformation::GetSystemDirectoryW; + let mut buf = vec![0u16; 260]; + // SAFETY: buf is initialized; we pass valid length and own the + // memory for the duration of the call. + let n = unsafe { GetSystemDirectoryW(Some(&mut buf)) }; + if n == 0 || (n as usize) > buf.len() { + return std::path::PathBuf::from("C:\\Windows\\System32\\wpr.exe"); + } + let dir = wxc_common::string_util::from_wide(&buf[..n as usize]); + let mut p = std::path::PathBuf::from(dir); + p.push("wpr.exe"); + p +} + +/// Stack-owned guard: ensures the audit trace is cancelled on panic +/// unwind and on normal function return. +pub struct AuditTraceGuard; + +impl Drop for AuditTraceGuard { + fn drop(&mut self) { + cancel_active_audit_trace(); + } +} + +/// Raw handle of the host-wide single-instance mutex for PLM audit +/// mode. Two concurrent `wxc-exec --audit` runs would share a single +/// NT Kernel Logger session, so the second one's `wpr -start` would +/// either steal the first's session or fail and silently corrupt the +/// first run's findings. We acquire a named mutex (`Global\\` so it's +/// machine-wide; admins have SeCreateGlobalPrivilege so this works) +/// and refuse to start if another wxc-exec audit is already running. +/// +/// The handle is stashed in a static atomic (not just the stack guard) +/// so the explicit cleanup before `process::exit` — which skips +/// destructors — can release it too. `AuditSingletonGuard::drop` is +/// a thin shim over `release_audit_singleton`; both paths are +/// idempotent. +static AUDIT_SINGLETON_HANDLE: AtomicIsize = AtomicIsize::new(0); + +pub struct AuditSingletonGuard; + +impl Drop for AuditSingletonGuard { + fn drop(&mut self) { + release_audit_singleton(); + } +} + +/// Release the host-wide audit singleton if held. 
Idempotent: safe to +/// call from `Drop`, from the explicit pre-`process::exit` cleanup, +/// and from error paths. +pub fn release_audit_singleton() { + plm::coordination::singleton::release(&AUDIT_SINGLETON_HANDLE); +} + +pub fn try_acquire_audit_singleton() -> Result { + use plm::coordination::singleton::{try_acquire, AcquireError}; + match try_acquire(&AUDIT_SINGLETON_HANDLE) { + Ok(()) => Ok(AuditSingletonGuard), + Err(AcquireError::AlreadyHeld) => Err(String::from( + "another wxc-exec --audit run holds the Global\\Mxc_Plm_Audit mutex; \ + refusing to start a second concurrent PLM trace (only one NT Kernel \ + Logger session can exist per host)", + )), + Err(AcquireError::CreateFailed(e)) => Err(format!("CreateMutexW failed: {e}")), + } +} diff --git a/src/core/wxc/src/elevation.rs b/src/core/wxc/src/elevation.rs new file mode 100644 index 00000000..a5a3daeb --- /dev/null +++ b/src/core/wxc/src/elevation.rs @@ -0,0 +1,168 @@ +//! Elevation detection and UAC self-relaunch for `wxc-exec --audit`. +//! +//! `--audit` starts a kernel ETW session via `wpr.exe` which requires +//! Administrator. Rather than fail with an opaque "Access denied" when +//! the caller isn't elevated, we detect the non-elevated state up-front +//! and re-launch ourselves with `ShellExecuteExW` + `runas`, triggering +//! the standard UAC prompt. We then wait for the elevated child, read +//! its exit code, and propagate it so the outer invoker sees the same +//! return contract. +//! +//! Only compiled on Windows. 
+ +use std::env; +use std::os::windows::ffi::OsStrExt; +use std::path::PathBuf; + +use windows::core::PCWSTR; +use windows::Win32::Foundation::{CloseHandle, ERROR_CANCELLED, HANDLE, WAIT_FAILED}; +use windows::Win32::Security::{GetTokenInformation, TokenElevation, TOKEN_ELEVATION, TOKEN_QUERY}; +use windows::Win32::System::Threading::{ + GetCurrentProcess, GetExitCodeProcess, OpenProcessToken, WaitForSingleObject, INFINITE, +}; +use windows::Win32::UI::Shell::{ShellExecuteExW, SEE_MASK_NOCLOSEPROCESS, SHELLEXECUTEINFOW}; + +/// `SW_SHOWNORMAL` from `Win32_UI_WindowsAndMessaging`. Duplicated here +/// (as a literal) so we don't pull in that whole feature just for one +/// constant. +const SW_SHOWNORMAL: i32 = 1; + +/// Returns `true` if the current process token is elevated. +/// +/// Uses `TokenElevation` (available since Vista). On any API failure we +/// return `false` — a spurious re-launch is preferable to running +/// `wpr.exe` and failing mid-trace with a confusing error. +pub fn is_elevated() -> bool { + unsafe { + let mut token: HANDLE = HANDLE::default(); + if OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &mut token).is_err() { + return false; + } + let mut elevation = TOKEN_ELEVATION::default(); + let mut ret_len: u32 = 0; + let ok = GetTokenInformation( + token, + TokenElevation, + Some(&mut elevation as *mut _ as *mut _), + std::mem::size_of::() as u32, + &mut ret_len, + ); + let _ = CloseHandle(token); + match ok { + Ok(()) => elevation.TokenIsElevated != 0, + Err(_) => false, + } + } +} + +/// Encodes a UTF-16 null-terminated buffer for Win32 wide-string APIs. +fn to_wide>(s: S) -> Vec { + s.as_ref().encode_wide().chain(std::iter::once(0)).collect() +} + +/// Quotes a single command-line argument using CommandLineToArgvW rules. +/// +/// See MSDN "Parsing C++ Command-Line Arguments". We wrap in double +/// quotes and escape internal backslash runs preceding a `"` plus the +/// `"` itself. Empty arguments become `""`. 
+fn quote_arg(arg: &str) -> String { + if !arg.is_empty() && !arg.contains([' ', '\t', '"', '\n']) { + return arg.to_string(); + } + let mut out = String::with_capacity(arg.len() + 2); + out.push('"'); + let chars: Vec<char> = arg.chars().collect(); + let mut i = 0; + while i < chars.len() { + let mut backslashes = 0; + while i < chars.len() && chars[i] == '\\' { + backslashes += 1; + i += 1; + } + if i == chars.len() { + for _ in 0..(backslashes * 2) { + out.push('\\'); + } + } else if chars[i] == '"' { + for _ in 0..(backslashes * 2 + 1) { + out.push('\\'); + } + out.push('"'); + i += 1; + } else { + for _ in 0..backslashes { + out.push('\\'); + } + out.push(chars[i]); + i += 1; + } + } + out.push('"'); + out +} + +/// Re-launches the current executable elevated with the same argv and +/// waits for it to exit. Returns the child's exit code on success, or +/// an error describing the failure (UAC declined, ShellExecute error). +pub fn relaunch_elevated_and_wait() -> Result<i32, String> { + let exe: PathBuf = env::current_exe().map_err(|e| format!("current_exe failed: {e}"))?; + + // Rebuild the argument string from argv (skip argv[0]). + let args: Vec<String> = env::args().skip(1).map(|a| quote_arg(&a)).collect(); + let params = args.join(" "); + + let verb_w = to_wide("runas"); + let file_w = to_wide(exe.as_os_str()); + let params_w = to_wide(&params); + let cwd_w = to_wide( + env::current_dir() + .map(|p| p.into_os_string()) + .unwrap_or_default(), + ); + + let mut sei = SHELLEXECUTEINFOW { + cbSize: std::mem::size_of::<SHELLEXECUTEINFOW>() as u32, + fMask: SEE_MASK_NOCLOSEPROCESS, + lpVerb: PCWSTR(verb_w.as_ptr()), + lpFile: PCWSTR(file_w.as_ptr()), + lpParameters: PCWSTR(params_w.as_ptr()), + lpDirectory: PCWSTR(cwd_w.as_ptr()), + nShow: SW_SHOWNORMAL, + ..Default::default() + }; + + let result = unsafe { ShellExecuteExW(&mut sei) }; + if let Err(e) = result { + // Distinguish UAC decline (ERROR_CANCELLED = 1223) from other + // failures so the caller can surface a friendlier message.
+ let code = e.code(); + let raw = code.0 as u32 & 0xFFFF; + if raw == ERROR_CANCELLED.0 { + return Err("UAC prompt was cancelled; --audit requires elevation.".to_string()); + } + return Err(format!("ShellExecuteExW failed: {e}")); + } + + let proc_handle = sei.hProcess; + if proc_handle.is_invalid() { + return Err("ShellExecuteExW returned no process handle".to_string()); + } + + // Wait for the elevated child and read its exit code. + let wait = unsafe { WaitForSingleObject(proc_handle, INFINITE) }; + if wait == WAIT_FAILED { + unsafe { + let _ = CloseHandle(proc_handle); + } + return Err("WaitForSingleObject failed on elevated child".to_string()); + } + let mut exit_code: u32 = 0; + let rc = unsafe { GetExitCodeProcess(proc_handle, &mut exit_code) }; + unsafe { + let _ = CloseHandle(proc_handle); + } + if rc.is_err() { + return Err("GetExitCodeProcess failed on elevated child".to_string()); + } + Ok(exit_code as i32) +} diff --git a/src/core/wxc/src/main.rs b/src/core/wxc/src/main.rs index 9ae68a28..78f55985 100644 --- a/src/core/wxc/src/main.rs +++ b/src/core/wxc/src/main.rs @@ -1,9 +1,15 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#[cfg(target_os = "windows")] +mod audit; +#[cfg(target_os = "windows")] +mod elevation; + use std::fmt::Write; use std::fs; use std::process; +use std::sync::atomic::Ordering; use std::sync::{Mutex, OnceLock}; use std::time::Instant; @@ -116,6 +122,27 @@ struct Cli { #[arg(long)] probe: bool, + /// Audit mode: inject the `permissiveLearningMode` capability into the + /// AppContainer policy so denied operations are logged but allowed. 
+ /// Windows-only — the PLM trace pipeline (WPR/ETW) and the runner-side + /// `request.audit` consumer (AppContainer) have no cross-platform + /// counterpart, so accepting the flag elsewhere would print a misleading + /// "restrictions will NOT be enforced" warning while the bubblewrap/ + /// seatbelt backends silently ignore both the flag and the injected + /// capability. + #[cfg(target_os = "windows")] + #[arg(long)] + audit: bool, + + /// Surface the PLM lifecycle diagnostics (spawn banner, plm.exe stderr + /// lines, non-zero-exit / spawn-failure reasons) on wxc-exec's stderr in + /// addition to the log buffer. Off by default so `--audit` doesn't pollute + /// the wrapped workload's stdout/stderr; opt in when debugging the audit + /// pipeline itself. + #[cfg(target_os = "windows")] + #[arg(long)] + audit_verbose: bool, + /// Command to run inside the container, overriding `process.commandLine` /// from the policy. The command must follow a `--` separator so normal /// CLI flags remain usable after the config path. Examples: @@ -300,6 +327,31 @@ fn config_input(cli: &Cli) -> Option<(String, bool)> { } } +/// Resolve the on-disk config path passed to `wxc-exec`, if any. Returns +/// `None` when the config was supplied as `--config-base64` (no file path) +/// or not at all. Used by `--audit` to thread `plm stop --config-path` so +/// findings can be merged back into the source policy. +#[cfg(target_os = "windows")] +fn config_file_path(cli: &Cli) -> Option<std::path::PathBuf> { + if cli.config_base64.is_some() { + return None; + } + cli.config + .as_ref() + .or(cli.config_path.as_ref()) + .map(std::path::PathBuf::from) +} + +// PLM audit-trace lifecycle helpers (plm.exe runner, cancel-on-exit +// guard, host-wide singleton, state flags) are defined in `audit.rs` +// and imported here for the `--audit` handling in this file.
+#[cfg(target_os = "windows")] +use audit::{ + cancel_active_audit_trace, mark_audit_active, release_audit_singleton, run_plm_command, + try_acquire_audit_singleton, AuditSingletonGuard, AuditTraceGuard, AUDIT_ACTIVE, + AUDIT_START_IN_FLIGHT, +}; + // --------------------------------------------------------------------------- // Graceful-exit DACL cleanup // --------------------------------------------------------------------------- @@ -392,23 +444,28 @@ impl Drop for ParkedDaclGuard { /// Returns `FALSE` so the default handler still runs (which terminates /// the process). /// -/// Acquires the slot with a bounded wait (≤5s), not `try_lock`. If the -/// main thread is mid-`Drop` on the same manager — which can be doing a -/// `SetNamedSecurityInfoW` — returning FALSE immediately lets the -/// default handler call `ExitProcess`, terminating that drop mid-Win32 -/// and leaving the host DACL in an inconsistent state. The bounded -/// wait blocks the default handler until either main finishes (lock -/// released) or 5s elapses — whichever comes first. On timeout we -/// proceed anyway; the recovery scan on the next `wxc-exec` startup -/// reaps anything left behind. +/// Acquires the slot with a bounded wait +/// (`plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT`), not `try_lock`. +/// If the main thread is mid-`Drop` on the same manager — which can be +/// doing a `SetNamedSecurityInfoW` — returning FALSE immediately lets +/// the default handler call `ExitProcess`, terminating that drop mid- +/// Win32 and leaving the host DACL in an inconsistent state. The +/// bounded wait blocks the default handler until either main finishes +/// (lock released) or the shared timeout elapses — whichever comes +/// first. On timeout we proceed anyway; the recovery scan on the next +/// `wxc-exec` startup reaps anything left behind. 
unsafe extern "system" fn dacl_ctrl_handler(_ctrl_type: u32) -> windows::core::BOOL { if let Some(slot) = DACL_CLEANUP_SLOT.get() { use std::time::{Duration, Instant}; - // 5s mirrors the WaitForSingleObject pattern recommended for - // graceful-shutdown handlers; tuned to be longer than a worst- - // case `SetNamedSecurityInfoW` on a deep tree but well under - // the Windows default 10s shutdown-handler budget. - let deadline = Instant::now() + Duration::from_secs(5); + // The handler runs TWO bounded waits (this one + the + // AUDIT_START_IN_FLIGHT wait below) before `wpr -cancel`, and + // CTRL_CLOSE_EVENT / CTRL_LOGOFF / CTRL_SHUTDOWN have a hard + // ~5s OS-imposed kill budget. The per-wait budget is sourced + // from the shared `plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT` + // so `wxc-exec` and `plm.exe`'s `plm_ctrl_handler` cannot + // drift apart, and the budget invariant is pinned by a unit + // test (`ctrl_handler_drain_timeout_respects_os_budget`). + let deadline = Instant::now() + plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT; loop { if let Ok(mut guard) = slot.try_lock() { // Either main already took the manager (guard is None) @@ -427,6 +484,32 @@ unsafe extern "system" fn dacl_ctrl_handler(_ctrl_type: u32) -> windows::core::B } // FALSE = "I did not fully handle this; run the next handler in the // chain (i.e. the default handler that calls ExitProcess)". + // + // if `plm start` is still in + // flight when Ctrl+C arrives, wait briefly for it to complete + // before deciding whether to issue `wpr -cancel`. Without this + // wait, a cancel that races a not-yet-engaged session is a no-op + // and the session leaks past wxc-exec exit. On timeout we proceed + // anyway — the next-startup `recover_orphaned_state` scan plus a + // manual `wpr -cancel` would catch any residue. + // + // the wait loop is + // implemented by `plm::coordination::wait_until_cleared`, the + // same tested helper `plm.exe`'s console-control handler uses. 
+ // + // the per-wait + // timeout is sourced from the shared + // `plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT` const so the + // wxc-exec and plm.exe handlers cannot drift apart. The const's + // docs (and the + // `ctrl_handler_drain_timeout_respects_os_budget` unit test) + // pin the ~5s OS kill-budget invariant. + let _ = plm::coordination::wait_until_cleared( + &AUDIT_START_IN_FLIGHT, + plm::coordination::CTRL_HANDLER_DRAIN_TIMEOUT, + std::time::Duration::from_millis(50), + ); + cancel_active_audit_trace(); windows::core::BOOL(0) } @@ -502,6 +585,26 @@ fn main() { return; } + // --audit needs Administrator (wpr.exe starts a kernel ETW session). + // Detect non-elevated up-front and re-launch under UAC so the caller + // sees the standard consent dialog instead of a mid-trace failure. + // Runs AFTER `--probe` (probe is non-privileged and used on cold + // startup) but BEFORE COM init / SetConsoleCtrlHandler so the + // relaunched elevated child does that work in a clean process. + #[cfg(target_os = "windows")] + if cli.audit && !elevation::is_elevated() { + if cli.audit_verbose { + eprintln!("[mxc] --audit requires elevation; requesting UAC and re-launching."); + } + match elevation::relaunch_elevated_and_wait() { + Ok(code) => process::exit(code), + Err(msg) => { + eprintln!("Error: {msg}"); + process::exit(1); + } + } + } + // Initialize COM/WinRT for backends that use WinRT APIs (Isolation Session). // COINIT_MULTITHREADED is benign for backends that don't use COM. // @@ -705,6 +808,10 @@ fn main() { request.experimental_enabled = cli.experimental; request.testing_features_enabled = cli.allow_testing_features; request.dry_run = cli.dry_run; + #[cfg(target_os = "windows")] + { + request.audit = cli.audit; + } // Apply the CLI command-line override to one-shot requests. State-aware // exec is handled above before dispatch. 
@@ -721,6 +828,37 @@ fn main() { }; apply_command_override(&mut request, command_override.as_deref(), &mut logger); + // --audit injects permissiveLearningMode so denied operations are logged + // but allowed. Works in both debug and release builds; in release the + // runner-side rejection is relaxed because request.audit is set. + // Windows-only: the flag itself only exists on Windows (see `Cli::audit`). + // + // Downstream capability lookups are case-sensitive (the AppContainer + // runner does exact string matches against the JSON capability name), + // so the "already present?" check here matches case-sensitively too. + // An operator who explicitly wrote a mis-cased spelling in the config + // gets a second, correctly-cased entry appended rather than silently + // relying on the mis-cased one that downstream lookups will ignore. + #[cfg(target_os = "windows")] + if cli.audit + && !request + .policy + .capabilities + .iter() + .any(|c| c == "permissiveLearningMode") + { + request + .policy + .capabilities + .push("permissiveLearningMode".to_string()); + logger.log("WARNING: --audit enabled - AppContainer restrictions will NOT be enforced\n"); + if cli.audit_verbose { + eprintln!( + "[mxc] permissiveLearningMode injected via --audit - AppContainer restrictions are NOT enforced" + ); + } + } + // Final validation: a command line must come from somewhere. If neither // the policy nor the CLI supplied one we cannot proceed. if request.script_code.is_empty() { @@ -975,11 +1113,130 @@ fn main() { } }; + // --audit: start the PLM (permissive learning mode) WPR trace before + // the runner spawns the container so we capture access-denied events + // for the lifetime of the workload. The matching `plm stop` below + // tears the trace down and (when the policy came from a file) + // merges findings back into it. Both calls are best-effort. 
+ // + // Bracket the live-trace window with `AUDIT_ACTIVE` + a stack guard + // so Ctrl-C / panic / process::exit between start and stop don't + // leak the kernel ETW session. + // + // declaration order matters. Rust + // drops locals in REVERSE declaration order, and on the cleanup + // path we want the trace guard (`AuditTraceGuard`, which calls + // `wpr -cancel`) to run BEFORE the singleton handle is released — + // otherwise a concurrent wxc-exec could acquire the freed mutex + // and start its own trace, only to have our stale `wpr -cancel` + // tear it down. Declare the singleton first so it drops last. + #[cfg(target_os = "windows")] + let _audit_singleton: Option<AuditSingletonGuard>; + #[cfg(target_os = "windows")] + let _audit_guard: Option<AuditTraceGuard>; + #[cfg(target_os = "windows")] + let audit_config_file = if cli.audit { + // refuse to start a second concurrent + // audit. We acquire the host-wide named mutex BEFORE marking + // AUDIT_ACTIVE so a failure here doesn't engage the cleanup + // path that would cancel someone else's running trace. + match try_acquire_audit_singleton() { + Ok(g) => _audit_singleton = Some(g), + Err(msg) => { + let _ = writeln!(logger, "[audit] {msg}"); + eprintln!("error: {msg}"); + _audit_singleton = None; + _audit_guard = None; + std::process::exit(1); + } + } + mark_audit_active(); + _audit_guard = Some(AuditTraceGuard); + // previously this was + // `let _ = run_plm_command(...)`, which discarded the failure + // status. If plm start failed (missing plm.exe, wpr session + // conflict not resolved, etc.), the workload ran with + // `permissiveLearningMode` injected into the sandbox policy + // but with zero WPR recording — an empty Adjusted_*.json + // looked like "no denials." Bail explicitly on start failure + // so the operator sees the error and the policy isn't + // silently relaxed.
+ // + // bracket the spawn with + // AUDIT_START_IN_FLIGHT so the console-control handler waits + // for it to drain before deciding whether to issue `wpr + // -cancel` (closes the Ctrl+C race where cancel arrives + // before `plm.exe`'s child `wpr -start` has engaged the + // kernel session). + AUDIT_START_IN_FLIGHT.store(true, Ordering::SeqCst); + let start_ok = run_plm_command( + &[std::ffi::OsStr::new("start")], + &mut logger, + cli.audit_verbose, + ); + AUDIT_START_IN_FLIGHT.store(false, Ordering::SeqCst); + if !start_ok { + let _ = writeln!( + logger, + "[audit] plm start failed; refusing to run the workload with \ + permissiveLearningMode but no WPR recording" + ); + eprintln!( + "error: plm start failed; refusing to run --audit without an \ + active trace. See logs for details." + ); + // cancel_active_audit_trace is idempotent and safe to call + // even if start never began a session — it inspects the + // AUDIT_ACTIVE flag and only invokes wpr -cancel if set. + cancel_active_audit_trace(); + std::process::exit(1); + } + config_file_path(&cli) + } else { + _audit_guard = None; + _audit_singleton = None; + None + }; + let run_start = Instant::now(); let response = runner.run(&request, &mut logger); let run_elapsed = run_start.elapsed(); let _ = writeln!(logger, "Runner completed in {}ms", run_elapsed.as_millis()); + // Tear down the PLM trace after the container exits, regardless of + // its exit code. Done before the runner is dropped so the trace + // tooling sees a fully-quiesced workload. + // + // only clear `AUDIT_ACTIVE` when `plm stop` actually + // succeeded. Previously the flag was cleared unconditionally, + // which silently leaked the kernel ETW session whenever stop + // failed (missing plm.exe, spawn fail, wpr -stop non-zero) and + // simultaneously turned `AuditTraceGuard::drop` and the Ctrl-C + // handler into no-ops. On failure we now leave the flag set so + // the stack guard's `Drop` runs `wpr -cancel` for us. 
+ #[cfg(target_os = "windows")] + if cli.audit { + let mut stop_args: Vec<std::ffi::OsString> = vec![std::ffi::OsString::from("stop")]; + if let Some(cfg) = audit_config_file.as_ref() { + stop_args.push(std::ffi::OsString::from("--config-path")); + stop_args.push(cfg.clone().into_os_string()); + } + let borrowed: Vec<&std::ffi::OsStr> = stop_args + .iter() + .map(std::ffi::OsString::as_os_str) + .collect(); + let stop_ok = run_plm_command(&borrowed, &mut logger, cli.audit_verbose); + if stop_ok { + AUDIT_ACTIVE.store(false, Ordering::SeqCst); + } else { + let _ = writeln!( + logger, + "[audit] plm stop failed; leaving AUDIT_ACTIVE set so cleanup guards \ + will run wpr -cancel on exit" + ); + } + } + // Explicitly drop the runner before retrieving the parked DACL // manager so any runner-internal resources holding child handles // release first; then drop the manager so its `restore()` runs. @@ -990,6 +1247,20 @@ fn main() { drop(runner); drop(take_parked_dacl()); + // the `process::exit` below skips destructors, so + // `AuditTraceGuard::drop` (which calls `cancel_active_audit_trace`) + // and `AuditSingletonGuard::drop` (which releases the host-wide + // named mutex) never run on the normal path. Leaving `AUDIT_ACTIVE` + // set so cleanup guards run `wpr -cancel` on stop failure is only + // true on the panic-unwind / Ctrl-C path, not here. Manually + // invoke the cleanups so a stop-failure path actually tears the + // kernel ETW session down and frees the singleton.
+ #[cfg(target_os = "windows")] + { + cancel_active_audit_trace(); + release_audit_singleton(); + } + if cli.dry_run { handle_dry_run_exit(&response, &mut logger); } diff --git a/src/core/wxc_common/src/config_parser.rs b/src/core/wxc_common/src/config_parser.rs index 56b25257..aed6939a 100644 --- a/src/core/wxc_common/src/config_parser.rs +++ b/src/core/wxc_common/src/config_parser.rs @@ -709,6 +709,9 @@ fn convert_wire_config( .capabilities .push("permissiveLearningMode".to_string()); logger.log("WARNING: 'learningMode' enabled - AppContainer restrictions will NOT be enforced (DEBUG BUILD ONLY)\n"); + eprintln!( + "[mxc] permissiveLearningMode injected via 'learningMode: true' - AppContainer restrictions are NOT enforced" + ); } #[cfg(not(debug_assertions))] { @@ -717,6 +720,12 @@ fn convert_wire_config( } if let Some(caps) = ac.capabilities { + #[cfg(debug_assertions)] + if caps.iter().any(|c| c == "permissiveLearningMode") { + eprintln!( + "[mxc] permissiveLearningMode present in policy capabilities - AppContainer restrictions are NOT enforced" + ); + } policy.capabilities.extend(caps); } @@ -1013,6 +1022,7 @@ fn convert_wire_config( testing_features_enabled: false, experimental, dry_run: false, + audit: false, }) } diff --git a/src/core/wxc_common/src/models.rs b/src/core/wxc_common/src/models.rs index 9ccee184..cc5ea4b8 100644 --- a/src/core/wxc_common/src/models.rs +++ b/src/core/wxc_common/src/models.rs @@ -691,6 +691,9 @@ pub struct ExecutionRequest { /// Dry-run mode: validate config and runner setup then return success /// without executing the sandboxed process. pub dry_run: bool, + /// Audit mode: when true, `permissiveLearningMode` is permitted even in + /// release builds (with a security warning). Set by the `--audit` CLI flag. + pub audit: bool, } /// Distinguishes whether an error occurred during process creation (launch)