Skip to content

Commit 4558603

Browse files
committed
replace self-exec with thread-based FSNotify worker
1 parent e368d60 commit 4558603

File tree

3 files changed

+234
-296
lines changed

3 files changed

+234
-296
lines changed

vminitd/Package.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ let package = Package(
4747
.executableTarget(
4848
name: "vminitd",
4949
dependencies: [
50-
.product(name: "ArgumentParser", package: "swift-argument-parser"),
5150
.product(name: "Logging", package: "swift-log"),
5251
.product(name: "_NIOFileSystem", package: "swift-nio"),
5352
.product(name: "Containerization", package: "containerization"),

vminitd/Sources/vminitd/Application.swift

Lines changed: 49 additions & 235 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
// limitations under the License.
1515
//===----------------------------------------------------------------------===//
1616

17-
import ArgumentParser
1817
import Containerization
1918
import ContainerizationError
2019
import ContainerizationOS
@@ -26,46 +25,29 @@ import NIOPosix
2625
#if os(Linux)
2726
import Musl
2827
import LCShim
29-
#else
30-
import Darwin
3128
#endif
3229

3330
@main
34-
struct Application: AsyncParsableCommand {
35-
static let configuration = CommandConfiguration(
36-
commandName: "vminitd",
37-
abstract: "VM init process and container agent",
38-
version: "1.0.0",
39-
subcommands: [
40-
InitCommand.self,
41-
FsNotifyCommand.self,
42-
],
43-
defaultSubcommand: InitCommand.self
44-
)
31+
struct Application {
32+
private static let foregroundEnvVar = "FOREGROUND"
33+
private static let vsockPort = 1024
34+
private static let standardErrorLock = NSLock()
4535

46-
static let foregroundEnvVar = "FOREGROUND"
47-
static let vsockPort = 1024
48-
static let standardErrorLock = NSLock()
49-
50-
static func runInForeground(_ log: Logger) throws {
51-
precondition(getpid() != 1, "runInForeground must not be called as PID 1")
52-
log.info("running vminitd under pid1 wrapper")
36+
private static func runInForeground(_ log: Logger) throws {
37+
log.info("running vminitd under pid1")
5338

5439
var command = Command("/sbin/vminitd")
5540
command.attrs = .init(setsid: true)
5641
command.stdin = .standardInput
5742
command.stdout = .standardOutput
5843
command.stderr = .standardError
59-
60-
var env = ProcessInfo.processInfo.environment
61-
env[foregroundEnvVar] = "1"
62-
command.environment = env.map { "\($0.key)=\($0.value)" }
44+
command.environment = ["\(foregroundEnvVar)=1"]
6345

6446
try command.start()
6547
_ = try command.wait()
6648
}
6749

68-
static func adjustLimits() throws {
50+
private static func adjustLimits() throws {
6951
var limits = rlimit()
7052
guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else {
7153
throw POSIXError(.init(rawValue: errno)!)
@@ -78,232 +60,64 @@ struct Application: AsyncParsableCommand {
7860
}
7961

8062
@Sendable
81-
static func standardError(label: String) -> StreamLogHandler {
63+
private static func standardError(label: String) -> StreamLogHandler {
8264
standardErrorLock.withLock {
8365
StreamLogHandler.standardError(label: label)
8466
}
8567
}
8668

87-
static func exit(_ code: Int32) -> Never {
88-
#if os(Linux)
89-
Musl.exit(code)
90-
#else
91-
Darwin.exit(code)
92-
#endif
93-
}
94-
}
95-
96-
extension Application {
97-
struct InitCommand: AsyncParsableCommand {
98-
static let configuration = CommandConfiguration(
99-
commandName: "init",
100-
abstract: "Run vminitd as init process (default)"
101-
)
102-
103-
func run() async throws {
104-
LoggingSystem.bootstrap(Application.standardError)
105-
var log = Logger(label: "vminitd")
106-
107-
try Application.adjustLimits()
108-
109-
// when running under debug mode, launch vminitd as a sub process of pid1
110-
// so that we get a chance to collect better logs and errors before pid1 exists
111-
// and the kernel panics.
112-
#if DEBUG
113-
let environment = ProcessInfo.processInfo.environment
114-
let foreground = environment[Application.foregroundEnvVar]
115-
let isPid1 = (getpid() == 1)
116-
log.info("checking for shim var \(Application.foregroundEnvVar)=\(String(describing: foreground)); pid=\(getpid())")
69+
static func main() async throws {
70+
LoggingSystem.bootstrap(standardError)
71+
var log = Logger(label: "vminitd")
11772

118-
// only use the FOREGROUND shim when we're not PID 1
119-
// if we are PID 1 (fresh VM boot), skip the shim to avoid exiting init
120-
if foreground == nil && !isPid1 {
121-
try Application.runInForeground(log)
122-
Application.exit(0) // parent is not PID 1; safe to exit after child completes
123-
}
73+
try adjustLimits()
12474

125-
// we only need to be a subreaper when we're not PID 1
126-
// (when PID 1, the kernel already reaps children)
127-
if !isPid1 {
128-
CZ_set_sub_reaper()
129-
}
130-
#endif
75+
// when running under debug mode, launch vminitd as a sub process of pid1
76+
// so that we get a chance to collect better logs and errors before pid1 exits
77+
// and the kernel panics.
78+
#if DEBUG
79+
let environment = ProcessInfo.processInfo.environment
80+
let foreground = environment[Self.foregroundEnvVar]
81+
log.info("checking for shim var \(foregroundEnvVar)=\(String(describing: foreground))")
13182

132-
signal(SIGPIPE, SIG_IGN)
133-
134-
// Because the sysctl rpc wouldn't make sense if this didn't always exist, we
135-
// ALWAYS mount /proc.
136-
guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else {
137-
log.error("failed to mount /proc")
138-
Application.exit(1)
139-
}
140-
guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else {
141-
log.error("failed to mount /run")
142-
Application.exit(1)
143-
}
144-
try Binfmt.mount()
145-
146-
log.logLevel = .debug
147-
148-
log.info("vminitd booting")
149-
let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
150-
let server = Initd(log: log, group: eg)
151-
152-
do {
153-
log.info("serving vminitd API")
154-
try await server.serve(port: Application.vsockPort)
155-
log.info("vminitd API returned")
156-
} catch {
157-
log.error("vminitd boot error \(error)")
158-
Application.exit(1)
159-
}
83+
if foreground == nil {
84+
try runInForeground(log)
85+
exit(0)
16086
}
161-
}
162-
}
163-
164-
extension Application {
165-
struct FsNotifyCommand: ParsableCommand {
166-
static let configuration = CommandConfiguration(
167-
commandName: "fs-notify",
168-
abstract: "Internal command to run filesystem notification worker in container namespace",
169-
shouldDisplay: false
170-
)
171-
172-
@Argument(help: "Container PID whose namespace to enter")
173-
var containerPID: Int32
174-
175-
private static let handshakeReady: UInt8 = 0xAA
176-
private static let handshakeFailure: UInt8 = 0xFF
17787

178-
func run() throws {
179-
// FD 3 = socket (extraFiles[0]), FD 4 = error pipe (extraFiles[1])
180-
let socketFD: Int32 = 3
181-
let errorPipeFD: Int32 = 4
182-
183-
do {
184-
try enterContainerNamespace(containerPID: containerPID)
185-
close(errorPipeFD)
186-
} catch {
187-
let errorMsg = "Failed to enter namespace: \(error)"
188-
_ = errorMsg.utf8CString.withUnsafeBufferPointer { buffer in
189-
// -1 to skip null terminator
190-
write(errorPipeFD, buffer.baseAddress, buffer.count - 1)
191-
}
192-
close(errorPipeFD)
193-
194-
var failureHandshake = Self.handshakeFailure
195-
_ = write(socketFD, &failureHandshake, 1)
196-
close(socketFD)
197-
Application.exit(1)
198-
}
199-
200-
var readyHandshake = Self.handshakeReady
201-
guard write(socketFD, &readyHandshake, 1) == 1 else {
202-
close(socketFD)
203-
Application.exit(1)
204-
}
205-
206-
while true {
207-
do {
208-
guard let (path, eventType) = try readEventFromParent(socket: socketFD) else {
209-
break
210-
}
211-
212-
do {
213-
try generateSyntheticInotifyEvent(path: path, eventType: eventType)
214-
} catch {
215-
// Log detailed error to stderr (captured by parent)
216-
let errorMsg = "Failed to generate inotify event: path=\(path), type=\(eventType), error=\(error)"
217-
fputs(errorMsg + "\n", stderr)
218-
fflush(stderr)
219-
}
88+
// since we are not running as pid1 in this mode we must set ourselves
89+
// as a subreaper so that all child processes are reaped by us and not
90+
// passed onto our parent.
91+
CZ_set_sub_reaper()
92+
#endif
22093

221-
} catch {
222-
// Log and exit
223-
fputs("Protocol error reading from parent: \(error)\n", stderr)
224-
fflush(stderr)
225-
break
226-
}
227-
}
94+
signal(SIGPIPE, SIG_IGN)
22895

229-
close(socketFD)
96+
// Because the sysctl rpc wouldn't make sense if this didn't always exist, we
97+
// ALWAYS mount /proc.
98+
guard Musl.mount("proc", "/proc", "proc", 0, "") == 0 else {
99+
log.error("failed to mount /proc")
100+
exit(1)
230101
}
231-
232-
private func enterContainerNamespace(containerPID: Int32) throws {
233-
let nsPath = "/proc/\(containerPID)/ns/mnt"
234-
let vmNsPath = "/proc/self/ns/mnt"
235-
236-
let containerNsStatPtr = UnsafeMutablePointer<stat>.allocate(capacity: 1)
237-
let vmNsStatPtr = UnsafeMutablePointer<stat>.allocate(capacity: 1)
238-
defer {
239-
containerNsStatPtr.deallocate()
240-
vmNsStatPtr.deallocate()
241-
}
242-
243-
let containerStatResult = stat(nsPath, containerNsStatPtr)
244-
let vmStatResult = stat(vmNsPath, vmNsStatPtr)
245-
246-
if containerStatResult == 0 && vmStatResult == 0 {
247-
let containerInode = containerNsStatPtr.pointee.st_ino
248-
let vmInode = vmNsStatPtr.pointee.st_ino
249-
250-
if containerInode == vmInode {
251-
return
252-
}
253-
}
254-
255-
let fd = open(nsPath, O_RDONLY)
256-
guard fd >= 0 else {
257-
throw ContainerizationError(.internalError, message: "Failed to open namespace file: \(nsPath), errno \(errno)")
258-
}
259-
defer {
260-
_ = close(fd)
261-
}
262-
let _ = unshare(CLONE_FS)
263-
let setnsResult = setns(fd, CLONE_NEWNS)
264-
guard setnsResult == 0 else {
265-
throw ContainerizationError(.internalError, message: "Failed to setns to mount namespace: errno \(errno)")
266-
}
102+
guard Musl.mount("tmpfs", "/run", "tmpfs", 0, "") == 0 else {
103+
log.error("failed to mount /run")
104+
exit(1)
267105
}
106+
try Binfmt.mount()
268107

269-
private func readEventFromParent(socket: Int32) throws -> (String, FileSystemEventType)? {
270-
var eventTypeValue: UInt32 = 0
271-
guard read(socket, &eventTypeValue, 4) == 4 else { return nil }
272-
eventTypeValue = UInt32(bigEndian: eventTypeValue)
273-
274-
var pathLen: UInt32 = 0
275-
guard read(socket, &pathLen, 4) == 4 else { return nil }
276-
pathLen = UInt32(bigEndian: pathLen)
277-
278-
let pathData = UnsafeMutablePointer<UInt8>.allocate(capacity: Int(pathLen))
279-
defer { pathData.deallocate() }
280-
guard read(socket, pathData, Int(pathLen)) == pathLen else { return nil }
281-
let pathBytes = Data(bytes: pathData, count: Int(pathLen))
282-
guard let path = String(data: pathBytes, encoding: .utf8) else { return nil }
283-
284-
guard let eventType = FileSystemEventType(rawValue: Int(eventTypeValue)) else {
285-
return nil
286-
}
287-
288-
return (path, eventType)
289-
}
108+
log.logLevel = .debug
290109

291-
private func generateSyntheticInotifyEvent(
292-
path: String,
293-
eventType: FileSystemEventType
294-
) throws {
295-
if eventType == .delete && !FileManager.default.fileExists(atPath: path) {
296-
return
297-
}
110+
log.info("vminitd booting")
111+
let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
112+
let server = Initd(log: log, group: eg)
298113

299-
let attributes = try FileManager.default.attributesOfItem(atPath: path)
300-
guard let permissions = attributes[.posixPermissions] as? NSNumber else {
301-
throw ContainerizationError(.internalError, message: "Failed to get file permissions for path: \(path)")
302-
}
303-
try FileManager.default.setAttributes(
304-
[.posixPermissions: permissions],
305-
ofItemAtPath: path
306-
)
114+
do {
115+
log.info("serving vminitd API")
116+
try await server.serve(port: vsockPort)
117+
log.info("vminitd API returned")
118+
} catch {
119+
log.error("vminitd boot error \(error)")
120+
exit(1)
307121
}
308122
}
309123
}

0 commit comments

Comments
 (0)