feat(remote): legible SSH/ACP failures + servers.json export/import

A vanished or misconfigured remote surfaced as an opaque 30s
"ACP request 'initialize' timed out" because the channel's EOF
fired with no exit code or stderr context, and `sh -c` on the
remote couldn't find pipx-installed `hermes` on PATH. This makes
remote failure modes immediately legible and adds a recovery path
for the server registry itself.

- `ACPClientError.processTerminated` now carries exit code + stderr
  tail; `performDisconnectCleanup` reads them from the channel
  before failing pending requests, and `ACPErrorHint.classify`
  recognises Connection refused, Operation timed out, Permission
  denied (publickey), Host key verification failed, Could not
  resolve hostname, and exit 127 / command not found.
- `ProcessACPChannel.terminationHandler` closes the stdout read
  end the moment the OS reaps the child so disconnect cleanup
  fires within ~1s instead of waiting on `availableData`.
  `lastExitCode` reads `Process.terminationStatus` directly to
  avoid an actor-handshake race.
- `SSHTransport.makeProcess` / `streamLines` switch from `sh -c`
  to `bash -lc` so non-interactive SSH shells source the user's
  profile and pick up pipx (`~/.local/bin`), Linuxbrew, asdf,
  and conda PATH entries.
- New `ServerRegistry.exportFile()` / `importEntries(from:)` with
  a `.scarfservers` JSON envelope (schema v1, dedupe by UUID,
  default-server flag preserved). UI in `ManageServersView`'s
  header menu surfaces Export… / Import… via NSSave/OpenPanel.
  No secrets travel — `identityFile` is a path string and SSH
  keys live in `~/.ssh/`, not in `servers.json`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alan Wizemann
2026-04-30 16:04:14 +02:00
parent 4140983866
commit 11946aad67
6 changed files with 353 additions and 24 deletions
@@ -47,6 +47,23 @@ public protocol ACPChannel: Sendable {
/// SSH exec channels return the SSH channel id or `nil` when not
/// applicable.
var diagnosticID: String? { get async }
/// Exit status of the underlying transport once it has terminated.
/// `nil` while the channel is still alive, or for transports that
/// don't have a meaningful integer exit code (Citadel SSH-exec).
/// Read by `ACPClient` when populating `processTerminated` so the
/// user-facing error can name the actual exit code (e.g. `exit
/// 255` for SSH connect failures, `exit 127` for missing remote
/// binary).
var lastExitCode: Int32? { get async }
}
public extension ACPChannel {
/// Default: channels that don't track an exit code report `nil`.
/// Concrete `ProcessACPChannel` overrides this.
var lastExitCode: Int32? {
get async { nil }
}
}
/// Errors raised by `ACPChannel` implementations when the underlying
@@ -468,35 +468,48 @@ public actor ACPClient {
// MARK: - Disconnect Cleanup
/// Single idempotent cleanup path for all disconnect scenarios.
private func performDisconnectCleanup(reason: String) {
/// Captures the channel's exit code + recent stderr BEFORE we drop
/// the reference, so the `processTerminated` error rides with
/// diagnostics the user banner shows "exit 255 ssh: connect to
/// host : Connection refused" instead of a bare opaque timeout.
private func performDisconnectCleanup(reason: String) async {
guard isConnected else { return }
#if canImport(os)
logger.warning("ACP disconnecting: \(reason)")
#endif
let exitCode = await channel?.lastExitCode
let tail = recentStderr
isConnected = false
statusMessage = "Connection lost"
for (_, continuation) in pendingRequests {
continuation.resume(throwing: ACPClientError.processTerminated)
continuation.resume(throwing: ACPClientError.processTerminated(
exitCode: exitCode,
stderrTail: tail
))
}
pendingRequests.removeAll()
eventContinuation?.finish()
eventContinuation = nil
}
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) {
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) async {
let reason = cleanly ? "read loop ended (EOF)" : "read loop failed: \(error?.localizedDescription ?? "unknown")"
performDisconnectCleanup(reason: reason)
await performDisconnectCleanup(reason: reason)
}
private func handleWriteFailed() {
performDisconnectCleanup(reason: "write failed (broken pipe)")
private func handleWriteFailed() async {
await performDisconnectCleanup(reason: "write failed (broken pipe)")
}
private func handleWriteFailedForRequest(id: Int) {
private func handleWriteFailedForRequest(id: Int) async {
if let continuation = pendingRequests.removeValue(forKey: id) {
continuation.resume(throwing: ACPClientError.processTerminated)
let exitCode = await channel?.lastExitCode
continuation.resume(throwing: ACPClientError.processTerminated(
exitCode: exitCode,
stderrTail: recentStderr
))
}
performDisconnectCleanup(reason: "write failed (broken pipe)")
await performDisconnectCleanup(reason: "write failed (broken pipe)")
}
}
@@ -507,7 +520,7 @@ public enum ACPClientError: Error, LocalizedError {
case encodingFailed
case invalidResponse(String)
case rpcError(code: Int, message: String)
case processTerminated
case processTerminated(exitCode: Int32?, stderrTail: String)
case requestTimeout(method: String)
public var errorDescription: String? {
@@ -516,10 +529,24 @@ public enum ACPClientError: Error, LocalizedError {
case .encodingFailed: return "Failed to encode JSON-RPC request"
case .invalidResponse(let msg): return "Invalid ACP response: \(msg)"
case .rpcError(let code, let msg): return "ACP error \(code): \(msg)"
case .processTerminated: return "ACP process terminated unexpectedly"
case .processTerminated(let exit, let tail):
let exitPart = exit.map { "exit \($0)" } ?? "no exit code"
let tailPart = Self.firstNonEmptyLine(in: tail).map { "\($0)" } ?? ""
return "ACP process terminated unexpectedly (\(exitPart))\(tailPart)"
case .requestTimeout(let method): return "ACP request '\(method)' timed out"
}
}
/// Pluck the first non-empty stderr line for the user-facing
/// summary. Full tail still rides through on `acpErrorDetails`,
/// but the description itself stays single-line.
private static func firstNonEmptyLine(in s: String) -> String? {
for raw in s.split(separator: "\n") {
let line = raw.trimmingCharacters(in: .whitespaces)
if !line.isEmpty { return line }
}
return nil
}
}
/// Maps a raw error message (RPC message or captured stderr) to a short
@@ -528,6 +555,40 @@ public enum ACPClientError: Error, LocalizedError {
public enum ACPErrorHint {
public static func classify(errorMessage: String, stderrTail: String) -> String? {
let haystack = errorMessage + "\n" + stderrTail
// SSH-level failures come first they apply only to remote
// contexts and the patterns are unambiguous (system ssh prints
// them verbatim to stderr). Without these classifications a
// vanished droplet, a wrong key, or a missing remote `hermes`
// all surface as opaque "ACP process terminated" / "request
// timed out", and the user has no idea where to look.
if haystack.contains("Connection refused") {
return "Couldn't reach the remote host — the SSH port is closed or the droplet is down. Check the host is running and reachable."
}
if haystack.localizedCaseInsensitiveContains("Operation timed out")
|| haystack.localizedCaseInsensitiveContains("Connection timed out")
|| haystack.contains("Network is unreachable")
|| haystack.contains("No route to host") {
return "Couldn't reach the remote host — the network connection timed out. Check the host is running and your network is up."
}
if haystack.contains("Permission denied (publickey")
|| haystack.contains("Permission denied, please try again") {
return "SSH rejected the key. Make sure the right identity file is selected and that ssh-agent has the key loaded — open Terminal and run `ssh-add -l`."
}
if haystack.contains("Host key verification failed")
|| haystack.contains("REMOTE HOST IDENTIFICATION HAS CHANGED") {
return "The remote host's SSH key changed. If you just rebuilt the droplet, remove the old entry with `ssh-keygen -R <host>`, then try again."
}
if haystack.contains("Could not resolve hostname")
|| haystack.contains("Name or service not known") {
return "Couldn't resolve the host name. Check the host in this server's settings."
}
if haystack.localizedCaseInsensitiveContains("command not found")
|| haystack.contains("hermes: not found")
|| haystack.contains("exit 127") {
return "The remote shell couldn't find `hermes`. Either install Hermes on the remote (`pipx install hermes-agent`) or set an absolute binary path in this server's settings."
}
if haystack.range(of: #"No\s+(Anthropic|OpenAI|OpenRouter|Gemini|Google|Groq|Mistral|XAI)?\s*credentials\s+found"#,
options: .regularExpression) != nil
|| haystack.contains("ANTHROPIC_API_KEY")
@@ -36,6 +36,17 @@ public actor ProcessACPChannel: ACPChannel {
private var readerTask: Task<Void, Never>?
private var stderrTask: Task<Void, Never>?
/// Read by `ACPClient` to fill in `processTerminated(exitCode:)`
/// so the error names the actual exit code rather than reporting a
/// bare timeout. Sourced directly from `Process` `Process` is
/// thread-safe for this read and reflects the actual reap state,
/// so we sidestep the race between the OS-side `terminationHandler`
/// callback and the EOF-driven disconnect cleanup that would
/// otherwise need an atomic to coordinate.
public var lastExitCode: Int32? {
process.isRunning ? nil : process.terminationStatus
}
/// The subprocess's PID as a human-readable string.
public var diagnosticID: String? {
"pid=\(process.processIdentifier)"
@@ -58,7 +69,7 @@ public actor ProcessACPChannel: ACPChannel {
proc.executableURL = URL(fileURLWithPath: executable)
proc.arguments = args
proc.environment = env
try await Self.launch(process: proc, self_: nil)
try await Self.launch(process: proc)
try Self.ignoreSIGPIPE_once()
self.process = proc
@@ -76,13 +87,14 @@ public actor ProcessACPChannel: ACPChannel {
self.stderrContinuation = errContinuation
await startReaders()
installTerminationHandler()
}
/// Secondary entry point for callers that have a pre-configured
/// `Process` (typically from `SSHTransport.makeProcess`). The process
/// must NOT already be running this initializer calls `run()`.
public init(process: Process) async throws {
try await Self.launch(process: process, self_: nil)
try await Self.launch(process: process)
try Self.ignoreSIGPIPE_once()
self.process = process
@@ -100,14 +112,12 @@ public actor ProcessACPChannel: ACPChannel {
self.stderrContinuation = errContinuation
await startReaders()
installTerminationHandler()
}
/// Wire fresh stdin/stdout/stderr pipes (overwriting any the caller
/// set) and start the subprocess. `self_` is unused today the
/// placeholder keeps the signature ready for a future hook that
/// captures termination in `proc.terminationHandler` and routes it
/// into the channel's actor state.
private static func launch(process: Process, self_: Any?) async throws {
/// set) and start the subprocess.
private static func launch(process: Process) async throws {
process.standardInput = Pipe()
process.standardOutput = Pipe()
process.standardError = Pipe()
@@ -118,6 +128,22 @@ public actor ProcessACPChannel: ACPChannel {
}
}
/// Install a `terminationHandler` that closes the stdout read end
/// the moment the OS reaps the child. Without this, the reader
/// loop's `availableData` keeps blocking until the kernel tears
/// the pipe down on its own schedule visible to the user as a
/// 30s ACP `initialize` timeout where a fast SSH-side failure
/// (Connection refused, exit 127) should surface in under a
/// second. The exit code itself is read on demand from
/// `Process.terminationStatus` (see `lastExitCode`), so this
/// callback doesn't need to touch actor state.
private func installTerminationHandler() {
let stdoutFh = stdoutPipe.fileHandleForReading
process.terminationHandler = { _ in
try? stdoutFh.close()
}
}
/// Ignore SIGPIPE once per process so a broken-pipe write returns
/// `EPIPE` (which we surface as `.writeEndClosed`) instead of
/// delivering SIGPIPE and tearing the app down. Idempotent; the
@@ -425,14 +425,18 @@ public struct SSHTransport: ServerTransport {
public func makeProcess(executable: String, args: [String]) -> Process {
ensureControlDir()
// `-T` disables pty allocation critical for binary-clean stdin/stdout
// (ACP JSON-RPC, log tail bytes). Same sh -c wrapping as runProcess
// so home-relative paths in `executable`/`args` actually expand.
// (ACP JSON-RPC, log tail bytes). `bash -lc` (login shell) sources the
// user's profile so PATH picks up pipx's `~/.local/bin`, Homebrew on
// Linux, asdf shims, and conda envs. Plain `sh -c` is non-login, so
// pipx-installed `hermes` isn't on PATH unless `hermesBinaryHint` was
// set explicitly exactly the failure that surfaces as a
// "command not found" / opaque init timeout against fresh droplets.
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
var sshArgv = sshArgs()
sshArgv.insert("-T", at: 0)
sshArgv.append(hostSpec)
sshArgv.append("sh")
sshArgv.append("-c")
sshArgv.append("bash")
sshArgv.append("-lc")
sshArgv.append(Self.shellQuote(cmd))
let proc = Process()
proc.executableURL = URL(fileURLWithPath: sshBinary)
@@ -453,12 +457,17 @@ public struct SSHTransport: ServerTransport {
return AsyncThrowingStream { continuation in
Task.detached { [self] in
ensureControlDir()
// `bash -lc` (login shell) so PATH picks up profile-only
// entries like pipx's `~/.local/bin` same rationale as
// `makeProcess` above. Streaming consumers (log tails)
// don't tolerate a missing-binary failure any better than
// ACP does.
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
var sshArgv = sshArgs()
sshArgv.insert("-T", at: 0)
sshArgv.append(hostSpec)
sshArgv.append("sh")
sshArgv.append("-c")
sshArgv.append("bash")
sshArgv.append("-lc")
sshArgv.append(Self.shellQuote(cmd))
let proc = Process()
proc.executableURL = URL(fileURLWithPath: sshBinary)