feat(remote): legible SSH/ACP failures + servers.json export/import

A vanished or misconfigured remote surfaced as an opaque 30s
"ACP request 'initialize' timed out" because the channel's EOF
fired with no exit code or stderr context, and `sh -c` on the
remote couldn't find pipx-installed `hermes` on PATH. This makes
remote failure modes immediately legible and adds a recovery path
for the server registry itself.

- `ACPClientError.processTerminated` now carries exit code + stderr
  tail; `performDisconnectCleanup` reads them from the channel
  before failing pending requests, and `ACPErrorHint.classify`
  recognises Connection refused, Operation timed out, Permission
  denied (publickey), Host key verification failed, Could not
  resolve hostname, and exit 127 / command not found.
- `ProcessACPChannel.terminationHandler` closes the stdout read
  end the moment the OS reaps the child so disconnect cleanup
  fires within ~1s instead of waiting on `availableData`.
  `lastExitCode` reads `Process.terminationStatus` directly to
  avoid an actor-handshake race.
- `SSHTransport.makeProcess` / `streamLines` switch from `sh -c`
  to `bash -lc` so non-interactive SSH shells source the user's
  profile and pick up pipx (`~/.local/bin`), Linuxbrew, asdf,
  and conda PATH entries.
- New `ServerRegistry.exportFile()` / `importEntries(from:)` with
  a `.scarfservers` JSON envelope (schema v1, dedupe by UUID,
  default-server flag preserved). UI in `ManageServersView`'s
  header menu surfaces Export… / Import… via NSSave/OpenPanel.
  No secrets travel — `identityFile` is a path string and SSH
  keys live in `~/.ssh/`, not in `servers.json`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alan Wizemann
2026-04-30 16:04:14 +02:00
parent 4140983866
commit 11946aad67
6 changed files with 353 additions and 24 deletions
@@ -47,6 +47,23 @@ public protocol ACPChannel: Sendable {
/// SSH exec channels return the SSH channel id or `nil` when not
/// applicable.
var diagnosticID: String? { get async }
/// Exit status of the underlying transport once it has terminated.
/// `nil` while the channel is still alive, or for transports that
/// don't have a meaningful integer exit code (Citadel SSH-exec).
/// Read by `ACPClient` when populating `processTerminated` so the
/// user-facing error can name the actual exit code (e.g. `exit
/// 255` for SSH connect failures, `exit 127` for missing remote
/// binary).
var lastExitCode: Int32? { get async }
}
public extension ACPChannel {
/// Default: channels that don't track an exit code report `nil`.
/// Concrete `ProcessACPChannel` overrides this.
var lastExitCode: Int32? {
get async { nil }
}
}
/// Errors raised by `ACPChannel` implementations when the underlying
@@ -468,35 +468,48 @@ public actor ACPClient {
// MARK: - Disconnect Cleanup
/// Single idempotent cleanup path for all disconnect scenarios.
private func performDisconnectCleanup(reason: String) {
/// Captures the channel's exit code + recent stderr BEFORE we drop
/// the reference, so the `processTerminated` error rides with
/// diagnostics the user banner shows "exit 255 ssh: connect to
/// host : Connection refused" instead of a bare opaque timeout.
private func performDisconnectCleanup(reason: String) async {
guard isConnected else { return }
#if canImport(os)
logger.warning("ACP disconnecting: \(reason)")
#endif
let exitCode = await channel?.lastExitCode
let tail = recentStderr
isConnected = false
statusMessage = "Connection lost"
for (_, continuation) in pendingRequests {
continuation.resume(throwing: ACPClientError.processTerminated)
continuation.resume(throwing: ACPClientError.processTerminated(
exitCode: exitCode,
stderrTail: tail
))
}
pendingRequests.removeAll()
eventContinuation?.finish()
eventContinuation = nil
}
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) {
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) async {
let reason = cleanly ? "read loop ended (EOF)" : "read loop failed: \(error?.localizedDescription ?? "unknown")"
performDisconnectCleanup(reason: reason)
await performDisconnectCleanup(reason: reason)
}
private func handleWriteFailed() {
performDisconnectCleanup(reason: "write failed (broken pipe)")
private func handleWriteFailed() async {
await performDisconnectCleanup(reason: "write failed (broken pipe)")
}
private func handleWriteFailedForRequest(id: Int) {
private func handleWriteFailedForRequest(id: Int) async {
if let continuation = pendingRequests.removeValue(forKey: id) {
continuation.resume(throwing: ACPClientError.processTerminated)
let exitCode = await channel?.lastExitCode
continuation.resume(throwing: ACPClientError.processTerminated(
exitCode: exitCode,
stderrTail: recentStderr
))
}
performDisconnectCleanup(reason: "write failed (broken pipe)")
await performDisconnectCleanup(reason: "write failed (broken pipe)")
}
}
@@ -507,7 +520,7 @@ public enum ACPClientError: Error, LocalizedError {
case encodingFailed
case invalidResponse(String)
case rpcError(code: Int, message: String)
case processTerminated
case processTerminated(exitCode: Int32?, stderrTail: String)
case requestTimeout(method: String)
public var errorDescription: String? {
@@ -516,10 +529,24 @@ public enum ACPClientError: Error, LocalizedError {
case .encodingFailed: return "Failed to encode JSON-RPC request"
case .invalidResponse(let msg): return "Invalid ACP response: \(msg)"
case .rpcError(let code, let msg): return "ACP error \(code): \(msg)"
case .processTerminated: return "ACP process terminated unexpectedly"
case .processTerminated(let exit, let tail):
let exitPart = exit.map { "exit \($0)" } ?? "no exit code"
let tailPart = Self.firstNonEmptyLine(in: tail).map { "\($0)" } ?? ""
return "ACP process terminated unexpectedly (\(exitPart))\(tailPart)"
case .requestTimeout(let method): return "ACP request '\(method)' timed out"
}
}
/// Pluck the first non-empty stderr line for the user-facing
/// summary. Full tail still rides through on `acpErrorDetails`,
/// but the description itself stays single-line.
private static func firstNonEmptyLine(in s: String) -> String? {
for raw in s.split(separator: "\n") {
let line = raw.trimmingCharacters(in: .whitespaces)
if !line.isEmpty { return line }
}
return nil
}
}
/// Maps a raw error message (RPC message or captured stderr) to a short
@@ -528,6 +555,40 @@ public enum ACPClientError: Error, LocalizedError {
public enum ACPErrorHint {
public static func classify(errorMessage: String, stderrTail: String) -> String? {
let haystack = errorMessage + "\n" + stderrTail
// SSH-level failures come first they apply only to remote
// contexts and the patterns are unambiguous (system ssh prints
// them verbatim to stderr). Without these classifications a
// vanished droplet, a wrong key, or a missing remote `hermes`
// all surface as opaque "ACP process terminated" / "request
// timed out", and the user has no idea where to look.
if haystack.contains("Connection refused") {
return "Couldn't reach the remote host — the SSH port is closed or the droplet is down. Check the host is running and reachable."
}
if haystack.localizedCaseInsensitiveContains("Operation timed out")
|| haystack.localizedCaseInsensitiveContains("Connection timed out")
|| haystack.contains("Network is unreachable")
|| haystack.contains("No route to host") {
return "Couldn't reach the remote host — the network connection timed out. Check the host is running and your network is up."
}
if haystack.contains("Permission denied (publickey")
|| haystack.contains("Permission denied, please try again") {
return "SSH rejected the key. Make sure the right identity file is selected and that ssh-agent has the key loaded — open Terminal and run `ssh-add -l`."
}
if haystack.contains("Host key verification failed")
|| haystack.contains("REMOTE HOST IDENTIFICATION HAS CHANGED") {
return "The remote host's SSH key changed. If you just rebuilt the droplet, remove the old entry with `ssh-keygen -R <host>`, then try again."
}
if haystack.contains("Could not resolve hostname")
|| haystack.contains("Name or service not known") {
return "Couldn't resolve the host name. Check the host in this server's settings."
}
if haystack.localizedCaseInsensitiveContains("command not found")
|| haystack.contains("hermes: not found")
|| haystack.contains("exit 127") {
return "The remote shell couldn't find `hermes`. Either install Hermes on the remote (`pipx install hermes-agent`) or set an absolute binary path in this server's settings."
}
if haystack.range(of: #"No\s+(Anthropic|OpenAI|OpenRouter|Gemini|Google|Groq|Mistral|XAI)?\s*credentials\s+found"#,
options: .regularExpression) != nil
|| haystack.contains("ANTHROPIC_API_KEY")
@@ -36,6 +36,17 @@ public actor ProcessACPChannel: ACPChannel {
private var readerTask: Task<Void, Never>?
private var stderrTask: Task<Void, Never>?
/// Read by `ACPClient` to fill in `processTerminated(exitCode:)`
/// so the error names the actual exit code rather than reporting a
/// bare timeout. Sourced directly from `Process` `Process` is
/// thread-safe for this read and reflects the actual reap state,
/// so we sidestep the race between the OS-side `terminationHandler`
/// callback and the EOF-driven disconnect cleanup that would
/// otherwise need an atomic to coordinate.
public var lastExitCode: Int32? {
process.isRunning ? nil : process.terminationStatus
}
/// The subprocess's PID as a human-readable string.
public var diagnosticID: String? {
"pid=\(process.processIdentifier)"
@@ -58,7 +69,7 @@ public actor ProcessACPChannel: ACPChannel {
proc.executableURL = URL(fileURLWithPath: executable)
proc.arguments = args
proc.environment = env
try await Self.launch(process: proc, self_: nil)
try await Self.launch(process: proc)
try Self.ignoreSIGPIPE_once()
self.process = proc
@@ -76,13 +87,14 @@ public actor ProcessACPChannel: ACPChannel {
self.stderrContinuation = errContinuation
await startReaders()
installTerminationHandler()
}
/// Secondary entry point for callers that have a pre-configured
/// `Process` (typically from `SSHTransport.makeProcess`). The process
/// must NOT already be running this initializer calls `run()`.
public init(process: Process) async throws {
try await Self.launch(process: process, self_: nil)
try await Self.launch(process: process)
try Self.ignoreSIGPIPE_once()
self.process = process
@@ -100,14 +112,12 @@ public actor ProcessACPChannel: ACPChannel {
self.stderrContinuation = errContinuation
await startReaders()
installTerminationHandler()
}
/// Wire fresh stdin/stdout/stderr pipes (overwriting any the caller
/// set) and start the subprocess. `self_` is unused today the
/// placeholder keeps the signature ready for a future hook that
/// captures termination in `proc.terminationHandler` and routes it
/// into the channel's actor state.
private static func launch(process: Process, self_: Any?) async throws {
/// set) and start the subprocess.
private static func launch(process: Process) async throws {
process.standardInput = Pipe()
process.standardOutput = Pipe()
process.standardError = Pipe()
@@ -118,6 +128,22 @@ public actor ProcessACPChannel: ACPChannel {
}
}
/// Install a `terminationHandler` that closes the stdout read end
/// the moment the OS reaps the child. Without this, the reader
/// loop's `availableData` keeps blocking until the kernel tears
/// the pipe down on its own schedule visible to the user as a
/// 30s ACP `initialize` timeout where a fast SSH-side failure
/// (Connection refused, exit 127) should surface in under a
/// second. The exit code itself is read on demand from
/// `Process.terminationStatus` (see `lastExitCode`), so this
/// callback doesn't need to touch actor state.
private func installTerminationHandler() {
let stdoutFh = stdoutPipe.fileHandleForReading
process.terminationHandler = { _ in
try? stdoutFh.close()
}
}
/// Ignore SIGPIPE once per process so a broken-pipe write returns
/// `EPIPE` (which we surface as `.writeEndClosed`) instead of
/// delivering SIGPIPE and tearing the app down. Idempotent; the
@@ -425,14 +425,18 @@ public struct SSHTransport: ServerTransport {
public func makeProcess(executable: String, args: [String]) -> Process {
ensureControlDir()
// `-T` disables pty allocation critical for binary-clean stdin/stdout
// (ACP JSON-RPC, log tail bytes). Same sh -c wrapping as runProcess
// so home-relative paths in `executable`/`args` actually expand.
// (ACP JSON-RPC, log tail bytes). `bash -lc` (login shell) sources the
// user's profile so PATH picks up pipx's `~/.local/bin`, Homebrew on
// Linux, asdf shims, and conda envs. Plain `sh -c` is non-login, so
// pipx-installed `hermes` isn't on PATH unless `hermesBinaryHint` was
// set explicitly exactly the failure that surfaces as a
// "command not found" / opaque init timeout against fresh droplets.
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
var sshArgv = sshArgs()
sshArgv.insert("-T", at: 0)
sshArgv.append(hostSpec)
sshArgv.append("sh")
sshArgv.append("-c")
sshArgv.append("bash")
sshArgv.append("-lc")
sshArgv.append(Self.shellQuote(cmd))
let proc = Process()
proc.executableURL = URL(fileURLWithPath: sshBinary)
@@ -453,12 +457,17 @@ public struct SSHTransport: ServerTransport {
return AsyncThrowingStream { continuation in
Task.detached { [self] in
ensureControlDir()
// `bash -lc` (login shell) so PATH picks up profile-only
// entries like pipx's `~/.local/bin` same rationale as
// `makeProcess` above. Streaming consumers (log tails)
// don't tolerate a missing-binary failure any better than
// ACP does.
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
var sshArgv = sshArgs()
sshArgv.insert("-T", at: 0)
sshArgv.append(hostSpec)
sshArgv.append("sh")
sshArgv.append("-c")
sshArgv.append("bash")
sshArgv.append("-lc")
sshArgv.append(Self.shellQuote(cmd))
let proc = Process()
proc.executableURL = URL(fileURLWithPath: sshBinary)
@@ -165,6 +165,119 @@ final class ServerRegistry {
SSHTransport.sweepStaleControlSockets()
}
// MARK: - Export / Import
/// Result summary returned from `importEntries(from:)`. The UI renders
/// it as a one-line confirmation so the user knows whether anything
/// changed (e.g. picking a stale export file imports zero entries
/// because every ID is already present).
struct ImportSummary: Equatable {
var imported: Int
var skippedDuplicates: Int
}
/// Errors raised by `importEntries(from:)` for the user-facing alert.
/// Validation is conservative we'd rather refuse a malformed file
/// than half-import garbage and leave the registry in a weird state.
enum ImportError: Error, LocalizedError {
case unreadable(String)
case malformed(String)
case unsupportedSchema(Int)
var errorDescription: String? {
switch self {
case .unreadable(let m): return "Couldn't read the file: \(m)"
case .malformed(let m): return "The file isn't a valid Scarf servers export: \(m)"
case .unsupportedSchema(let v): return "This export uses schema v\(v), which this version of Scarf doesn't recognize."
}
}
}
/// Encode the current registry as a portable export. `displayName`,
/// `host`, `user`, `port`, `identityFile` (path string only),
/// `remoteHome`, `projectsRoot`, `hermesBinaryHint`, `openOnLaunch`,
/// and the entry's stable UUID travel. **No secrets** ride along
/// SSH private keys live at the path referenced by `identityFile`,
/// not in `servers.json`. Importing on a different Mac requires the
/// user to copy their `~/.ssh/` keys separately (or re-point each
/// entry's identityFile in Edit Server).
func exportFile() throws -> Data {
let payload = ExportFile(
schemaVersion: Self.currentSchemaVersion,
exportedAt: ISO8601DateFormatter().string(from: Date()),
entries: entries
)
let encoder = JSONEncoder()
encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
return try encoder.encode(payload)
}
/// Merge entries from a `.scarfservers` file. Dedupe is by UUID
/// entries whose ID already exists are skipped (the existing
/// entry wins, since it may carry edits the user made post-export).
/// `openOnLaunch` is normalized after import: at most one entry
/// can be the default, and conflicts resolve in favor of the
/// pre-existing default.
@discardableResult
func importEntries(from data: Data) throws -> ImportSummary {
let payload: ExportFile
do {
payload = try JSONDecoder().decode(ExportFile.self, from: data)
} catch {
throw ImportError.malformed(error.localizedDescription)
}
guard payload.schemaVersion == Self.currentSchemaVersion else {
throw ImportError.unsupportedSchema(payload.schemaVersion)
}
let existingIDs = Set(entries.map(\.id))
var imported = 0
var skipped = 0
for incoming in payload.entries {
if existingIDs.contains(incoming.id) {
skipped += 1
continue
}
var copy = incoming
// Don't let an imported entry seize the default slot if the
// user already has one assigned. Normalization below also
// drops `openOnLaunch` if more than one survives.
if entries.contains(where: { $0.openOnLaunch }) {
copy.openOnLaunch = false
}
entries.append(copy)
imported += 1
}
// Belt-and-suspenders: if multiple entries somehow ended up
// flagged as default (e.g. user imported an export that itself
// had the flag on a different entry than the local default),
// keep only the first one.
var sawDefault = false
for idx in entries.indices {
if entries[idx].openOnLaunch {
if sawDefault { entries[idx].openOnLaunch = false }
else { sawDefault = true }
}
}
save()
if imported > 0 { onEntriesChanged?() }
return ImportSummary(imported: imported, skippedDuplicates: skipped)
}
/// Disk envelope distinct from `RegistryFile`. Adds the export
/// timestamp; structurally compatible so a hand-edited export
/// could in theory be dropped at `~/Library/Application
/// Support/scarf/servers.json` and load we don't rely on that,
/// but keeping the shape close means one less migration surface
/// when we eventually add fields here.
private struct ExportFile: Codable {
var schemaVersion: Int
var exportedAt: String
var entries: [ServerEntry]
}
// MARK: - Persistence
private func load() {
@@ -1,6 +1,8 @@
import SwiftUI
import ScarfCore
import ScarfDesign
import UniformTypeIdentifiers
import AppKit
/// List of registered remote servers with add/remove actions. Rendered as a
/// popover from the toolbar switcher.
@@ -9,6 +11,16 @@ struct ManageServersView: View {
@State private var showAddSheet = false
@State private var pendingRemoveID: ServerID?
@State private var diagnosticsContext: ServerContext?
@State private var importAlert: ImportAlertState?
/// Lightweight wrapper around the after-import message so we can
/// present a single SwiftUI `.alert` for both success summaries
/// ("Imported 3 servers") and refusals ("Schema v2 not recognized").
private struct ImportAlertState: Identifiable {
var id = UUID()
var title: String
var message: String
}
var body: some View {
VStack(alignment: .leading, spacing: 0) {
@@ -49,6 +61,9 @@ struct ManageServersView: View {
Text("The server's SSH configuration is removed from Scarf. Your remote files are untouched.")
}
)
.alert(item: $importAlert) { state in
Alert(title: Text(state.title), message: Text(state.message), dismissButton: .default(Text("OK")))
}
}
/// Wrapper because `ServerContext` isn't `Identifiable` against the sheet
@@ -62,6 +77,17 @@ struct ManageServersView: View {
HStack {
Text("Servers").scarfStyle(.headline)
Spacer()
Menu {
Button("Export Servers…") { exportServers() }
.disabled(registry.entries.isEmpty)
Button("Import Servers…") { importServers() }
} label: {
Image(systemName: "ellipsis.circle")
}
.menuStyle(.borderlessButton)
.menuIndicator(.hidden)
.fixedSize()
.help("Export or import the list of remote servers. SSH keys aren't included — you copy those separately.")
Button {
showAddSheet = true
} label: {
@@ -72,6 +98,83 @@ struct ManageServersView: View {
.padding(12)
}
/// `.scarfservers` is a plain JSON file (`ServerRegistry.exportFile()`).
/// Declared inline so callers don't need a shared UTType module just to
/// open one save panel. The conformance is dual: also `.json` so users
/// renaming the file don't break the import handler.
private static let scarfServersType: UTType = {
if let t = UTType("com.scarf.servers") { return t }
return UTType.json
}()
private func exportServers() {
let panel = NSSavePanel()
panel.title = "Export Servers"
panel.prompt = "Export"
panel.allowedContentTypes = [Self.scarfServersType, .json]
panel.nameFieldStringValue = "scarf-servers-\(Self.todayStamp()).scarfservers"
panel.canCreateDirectories = true
panel.isExtensionHidden = false
guard panel.runModal() == .OK, let url = panel.url else { return }
do {
let data = try registry.exportFile()
try data.write(to: url, options: .atomic)
} catch {
importAlert = ImportAlertState(
title: "Couldn't export servers",
message: error.localizedDescription
)
}
}
private func importServers() {
let panel = NSOpenPanel()
panel.title = "Import Servers"
panel.prompt = "Import"
panel.allowedContentTypes = [Self.scarfServersType, .json]
panel.allowsMultipleSelection = false
panel.canChooseDirectories = false
guard panel.runModal() == .OK, let url = panel.url else { return }
do {
let data = try Data(contentsOf: url)
let summary = try registry.importEntries(from: data)
let count = summary.imported
let skipped = summary.skippedDuplicates
let title = count == 0 && skipped > 0
? "Nothing to import"
: (count == 1 ? "Imported 1 server" : "Imported \(count) servers")
var lines: [String] = []
if count == 0 && skipped > 0 {
lines.append("Every entry was already in your registry. Nothing changed.")
} else if skipped > 0 {
lines.append("\(skipped) duplicate \(skipped == 1 ? "entry was" : "entries were") skipped — your existing copy is preserved.")
}
lines.append("SSH keys aren't included in the export — make sure your `~/.ssh/` keys are in place on this Mac, or edit each server to point at the right identity file.")
importAlert = ImportAlertState(title: title, message: lines.joined(separator: "\n\n"))
} catch let err as ServerRegistry.ImportError {
importAlert = ImportAlertState(
title: "Couldn't import servers",
message: err.localizedDescription
)
} catch {
importAlert = ImportAlertState(
title: "Couldn't import servers",
message: error.localizedDescription
)
}
}
/// `yyyy-MM-dd` so the exported filename sorts naturally in Finder
/// when a user accumulates rotating exports.
private static func todayStamp() -> String {
let f = DateFormatter()
f.calendar = Calendar(identifier: .iso8601)
f.locale = Locale(identifier: "en_US_POSIX")
f.timeZone = TimeZone(identifier: "UTC")
f.dateFormat = "yyyy-MM-dd"
return f.string(from: Date())
}
private var empty: some View {
VStack(spacing: 8) {
Image(systemName: "server.rack")