mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 18:44:45 +00:00
feat(remote): legible SSH/ACP failures + servers.json export/import
A vanished or misconfigured remote surfaced as an opaque 30s "ACP request 'initialize' timed out" because the channel's EOF fired with no exit code or stderr context, and `sh -c` on the remote couldn't find pipx-installed `hermes` on PATH. This makes remote failure modes immediately legible and adds a recovery path for the server registry itself. - `ACPClientError.processTerminated` now carries exit code + stderr tail; `performDisconnectCleanup` reads them from the channel before failing pending requests, and `ACPErrorHint.classify` recognises Connection refused, Operation timed out, Permission denied (publickey), Host key verification failed, Could not resolve hostname, and exit 127 / command not found. - `ProcessACPChannel.terminationHandler` closes the stdout read end the moment the OS reaps the child so disconnect cleanup fires within ~1s instead of waiting on `availableData`. `lastExitCode` reads `Process.terminationStatus` directly to avoid an actor-handshake race. - `SSHTransport.makeProcess` / `streamLines` switch from `sh -c` to `bash -lc` so non-interactive SSH shells source the user's profile and pick up pipx (`~/.local/bin`), Linuxbrew, asdf, and conda PATH entries. - New `ServerRegistry.exportFile()` / `importEntries(from:)` with a `.scarfservers` JSON envelope (schema v1, dedupe by UUID, default-server flag preserved). UI in `ManageServersView`'s header menu surfaces Export… / Import… via NSSave/OpenPanel. No secrets travel — `identityFile` is a path string and SSH keys live in `~/.ssh/`, not in `servers.json`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -47,6 +47,23 @@ public protocol ACPChannel: Sendable {
|
||||
/// SSH exec channels return the SSH channel id or `nil` when not
|
||||
/// applicable.
|
||||
var diagnosticID: String? { get async }
|
||||
|
||||
/// Exit status of the underlying transport once it has terminated.
|
||||
/// `nil` while the channel is still alive, or for transports that
|
||||
/// don't have a meaningful integer exit code (Citadel SSH-exec).
|
||||
/// Read by `ACPClient` when populating `processTerminated` so the
|
||||
/// user-facing error can name the actual exit code (e.g. `exit
|
||||
/// 255` for SSH connect failures, `exit 127` for missing remote
|
||||
/// binary).
|
||||
var lastExitCode: Int32? { get async }
|
||||
}
|
||||
|
||||
public extension ACPChannel {
|
||||
/// Default: channels that don't track an exit code report `nil`.
|
||||
/// Concrete `ProcessACPChannel` overrides this.
|
||||
var lastExitCode: Int32? {
|
||||
get async { nil }
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors raised by `ACPChannel` implementations when the underlying
|
||||
|
||||
@@ -468,35 +468,48 @@ public actor ACPClient {
|
||||
// MARK: - Disconnect Cleanup
|
||||
|
||||
/// Single idempotent cleanup path for all disconnect scenarios.
|
||||
private func performDisconnectCleanup(reason: String) {
|
||||
/// Captures the channel's exit code + recent stderr BEFORE we drop
|
||||
/// the reference, so the `processTerminated` error rides with
|
||||
/// diagnostics — the user banner shows "exit 255 — ssh: connect to
|
||||
/// host …: Connection refused" instead of a bare opaque timeout.
|
||||
private func performDisconnectCleanup(reason: String) async {
|
||||
guard isConnected else { return }
|
||||
#if canImport(os)
|
||||
logger.warning("ACP disconnecting: \(reason)")
|
||||
#endif
|
||||
let exitCode = await channel?.lastExitCode
|
||||
let tail = recentStderr
|
||||
isConnected = false
|
||||
statusMessage = "Connection lost"
|
||||
for (_, continuation) in pendingRequests {
|
||||
continuation.resume(throwing: ACPClientError.processTerminated)
|
||||
continuation.resume(throwing: ACPClientError.processTerminated(
|
||||
exitCode: exitCode,
|
||||
stderrTail: tail
|
||||
))
|
||||
}
|
||||
pendingRequests.removeAll()
|
||||
eventContinuation?.finish()
|
||||
eventContinuation = nil
|
||||
}
|
||||
|
||||
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) {
|
||||
private func handleReadLoopEnded(cleanly: Bool, error: Error? = nil) async {
|
||||
let reason = cleanly ? "read loop ended (EOF)" : "read loop failed: \(error?.localizedDescription ?? "unknown")"
|
||||
performDisconnectCleanup(reason: reason)
|
||||
await performDisconnectCleanup(reason: reason)
|
||||
}
|
||||
|
||||
private func handleWriteFailed() {
|
||||
performDisconnectCleanup(reason: "write failed (broken pipe)")
|
||||
private func handleWriteFailed() async {
|
||||
await performDisconnectCleanup(reason: "write failed (broken pipe)")
|
||||
}
|
||||
|
||||
private func handleWriteFailedForRequest(id: Int) {
|
||||
private func handleWriteFailedForRequest(id: Int) async {
|
||||
if let continuation = pendingRequests.removeValue(forKey: id) {
|
||||
continuation.resume(throwing: ACPClientError.processTerminated)
|
||||
let exitCode = await channel?.lastExitCode
|
||||
continuation.resume(throwing: ACPClientError.processTerminated(
|
||||
exitCode: exitCode,
|
||||
stderrTail: recentStderr
|
||||
))
|
||||
}
|
||||
performDisconnectCleanup(reason: "write failed (broken pipe)")
|
||||
await performDisconnectCleanup(reason: "write failed (broken pipe)")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -507,7 +520,7 @@ public enum ACPClientError: Error, LocalizedError {
|
||||
case encodingFailed
|
||||
case invalidResponse(String)
|
||||
case rpcError(code: Int, message: String)
|
||||
case processTerminated
|
||||
case processTerminated(exitCode: Int32?, stderrTail: String)
|
||||
case requestTimeout(method: String)
|
||||
|
||||
public var errorDescription: String? {
|
||||
@@ -516,10 +529,24 @@ public enum ACPClientError: Error, LocalizedError {
|
||||
case .encodingFailed: return "Failed to encode JSON-RPC request"
|
||||
case .invalidResponse(let msg): return "Invalid ACP response: \(msg)"
|
||||
case .rpcError(let code, let msg): return "ACP error \(code): \(msg)"
|
||||
case .processTerminated: return "ACP process terminated unexpectedly"
|
||||
case .processTerminated(let exit, let tail):
|
||||
let exitPart = exit.map { "exit \($0)" } ?? "no exit code"
|
||||
let tailPart = Self.firstNonEmptyLine(in: tail).map { " — \($0)" } ?? ""
|
||||
return "ACP process terminated unexpectedly (\(exitPart))\(tailPart)"
|
||||
case .requestTimeout(let method): return "ACP request '\(method)' timed out"
|
||||
}
|
||||
}
|
||||
|
||||
/// Pluck the first non-empty stderr line for the user-facing
|
||||
/// summary. Full tail still rides through on `acpErrorDetails`,
|
||||
/// but the description itself stays single-line.
|
||||
private static func firstNonEmptyLine(in s: String) -> String? {
|
||||
for raw in s.split(separator: "\n") {
|
||||
let line = raw.trimmingCharacters(in: .whitespaces)
|
||||
if !line.isEmpty { return line }
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a raw error message (RPC message or captured stderr) to a short
|
||||
@@ -528,6 +555,40 @@ public enum ACPClientError: Error, LocalizedError {
|
||||
public enum ACPErrorHint {
|
||||
public static func classify(errorMessage: String, stderrTail: String) -> String? {
|
||||
let haystack = errorMessage + "\n" + stderrTail
|
||||
|
||||
// SSH-level failures come first — they apply only to remote
|
||||
// contexts and the patterns are unambiguous (system ssh prints
|
||||
// them verbatim to stderr). Without these classifications a
|
||||
// vanished droplet, a wrong key, or a missing remote `hermes`
|
||||
// all surface as opaque "ACP process terminated" / "request
|
||||
// timed out", and the user has no idea where to look.
|
||||
if haystack.contains("Connection refused") {
|
||||
return "Couldn't reach the remote host — the SSH port is closed or the droplet is down. Check the host is running and reachable."
|
||||
}
|
||||
if haystack.localizedCaseInsensitiveContains("Operation timed out")
|
||||
|| haystack.localizedCaseInsensitiveContains("Connection timed out")
|
||||
|| haystack.contains("Network is unreachable")
|
||||
|| haystack.contains("No route to host") {
|
||||
return "Couldn't reach the remote host — the network connection timed out. Check the host is running and your network is up."
|
||||
}
|
||||
if haystack.contains("Permission denied (publickey")
|
||||
|| haystack.contains("Permission denied, please try again") {
|
||||
return "SSH rejected the key. Make sure the right identity file is selected and that ssh-agent has the key loaded — open Terminal and run `ssh-add -l`."
|
||||
}
|
||||
if haystack.contains("Host key verification failed")
|
||||
|| haystack.contains("REMOTE HOST IDENTIFICATION HAS CHANGED") {
|
||||
return "The remote host's SSH key changed. If you just rebuilt the droplet, remove the old entry with `ssh-keygen -R <host>`, then try again."
|
||||
}
|
||||
if haystack.contains("Could not resolve hostname")
|
||||
|| haystack.contains("Name or service not known") {
|
||||
return "Couldn't resolve the host name. Check the host in this server's settings."
|
||||
}
|
||||
if haystack.localizedCaseInsensitiveContains("command not found")
|
||||
|| haystack.contains("hermes: not found")
|
||||
|| haystack.contains("exit 127") {
|
||||
return "The remote shell couldn't find `hermes`. Either install Hermes on the remote (`pipx install hermes-agent`) or set an absolute binary path in this server's settings."
|
||||
}
|
||||
|
||||
if haystack.range(of: #"No\s+(Anthropic|OpenAI|OpenRouter|Gemini|Google|Groq|Mistral|XAI)?\s*credentials\s+found"#,
|
||||
options: .regularExpression) != nil
|
||||
|| haystack.contains("ANTHROPIC_API_KEY")
|
||||
|
||||
@@ -36,6 +36,17 @@ public actor ProcessACPChannel: ACPChannel {
|
||||
private var readerTask: Task<Void, Never>?
|
||||
private var stderrTask: Task<Void, Never>?
|
||||
|
||||
/// Read by `ACPClient` to fill in `processTerminated(exitCode:…)`
|
||||
/// so the error names the actual exit code rather than reporting a
|
||||
/// bare timeout. Sourced directly from `Process` — `Process` is
|
||||
/// thread-safe for this read and reflects the actual reap state,
|
||||
/// so we sidestep the race between the OS-side `terminationHandler`
|
||||
/// callback and the EOF-driven disconnect cleanup that would
|
||||
/// otherwise need an atomic to coordinate.
|
||||
public var lastExitCode: Int32? {
|
||||
process.isRunning ? nil : process.terminationStatus
|
||||
}
|
||||
|
||||
/// The subprocess's PID as a human-readable string.
|
||||
public var diagnosticID: String? {
|
||||
"pid=\(process.processIdentifier)"
|
||||
@@ -58,7 +69,7 @@ public actor ProcessACPChannel: ACPChannel {
|
||||
proc.executableURL = URL(fileURLWithPath: executable)
|
||||
proc.arguments = args
|
||||
proc.environment = env
|
||||
try await Self.launch(process: proc, self_: nil)
|
||||
try await Self.launch(process: proc)
|
||||
try Self.ignoreSIGPIPE_once()
|
||||
|
||||
self.process = proc
|
||||
@@ -76,13 +87,14 @@ public actor ProcessACPChannel: ACPChannel {
|
||||
self.stderrContinuation = errContinuation
|
||||
|
||||
await startReaders()
|
||||
installTerminationHandler()
|
||||
}
|
||||
|
||||
/// Secondary entry point for callers that have a pre-configured
|
||||
/// `Process` (typically from `SSHTransport.makeProcess`). The process
|
||||
/// must NOT already be running — this initializer calls `run()`.
|
||||
public init(process: Process) async throws {
|
||||
try await Self.launch(process: process, self_: nil)
|
||||
try await Self.launch(process: process)
|
||||
try Self.ignoreSIGPIPE_once()
|
||||
|
||||
self.process = process
|
||||
@@ -100,14 +112,12 @@ public actor ProcessACPChannel: ACPChannel {
|
||||
self.stderrContinuation = errContinuation
|
||||
|
||||
await startReaders()
|
||||
installTerminationHandler()
|
||||
}
|
||||
|
||||
/// Wire fresh stdin/stdout/stderr pipes (overwriting any the caller
|
||||
/// set) and start the subprocess. `self_` is unused today — the
|
||||
/// placeholder keeps the signature ready for a future hook that
|
||||
/// captures termination in `proc.terminationHandler` and routes it
|
||||
/// into the channel's actor state.
|
||||
private static func launch(process: Process, self_: Any?) async throws {
|
||||
/// set) and start the subprocess.
|
||||
private static func launch(process: Process) async throws {
|
||||
process.standardInput = Pipe()
|
||||
process.standardOutput = Pipe()
|
||||
process.standardError = Pipe()
|
||||
@@ -118,6 +128,22 @@ public actor ProcessACPChannel: ACPChannel {
|
||||
}
|
||||
}
|
||||
|
||||
/// Install a `terminationHandler` that closes the stdout read end
|
||||
/// the moment the OS reaps the child. Without this, the reader
|
||||
/// loop's `availableData` keeps blocking until the kernel tears
|
||||
/// the pipe down on its own schedule — visible to the user as a
|
||||
/// 30s ACP `initialize` timeout where a fast SSH-side failure
|
||||
/// (Connection refused, exit 127) should surface in under a
|
||||
/// second. The exit code itself is read on demand from
|
||||
/// `Process.terminationStatus` (see `lastExitCode`), so this
|
||||
/// callback doesn't need to touch actor state.
|
||||
private func installTerminationHandler() {
|
||||
let stdoutFh = stdoutPipe.fileHandleForReading
|
||||
process.terminationHandler = { _ in
|
||||
try? stdoutFh.close()
|
||||
}
|
||||
}
|
||||
|
||||
/// Ignore SIGPIPE once per process so a broken-pipe write returns
|
||||
/// `EPIPE` (which we surface as `.writeEndClosed`) instead of
|
||||
/// delivering SIGPIPE and tearing the app down. Idempotent; the
|
||||
|
||||
@@ -425,14 +425,18 @@ public struct SSHTransport: ServerTransport {
|
||||
public func makeProcess(executable: String, args: [String]) -> Process {
|
||||
ensureControlDir()
|
||||
// `-T` disables pty allocation — critical for binary-clean stdin/stdout
|
||||
// (ACP JSON-RPC, log tail bytes). Same sh -c wrapping as runProcess
|
||||
// so home-relative paths in `executable`/`args` actually expand.
|
||||
// (ACP JSON-RPC, log tail bytes). `bash -lc` (login shell) sources the
|
||||
// user's profile so PATH picks up pipx's `~/.local/bin`, Homebrew on
|
||||
// Linux, asdf shims, and conda envs. Plain `sh -c` is non-login, so
|
||||
// pipx-installed `hermes` isn't on PATH unless `hermesBinaryHint` was
|
||||
// set explicitly — exactly the failure that surfaces as a
|
||||
// "command not found" / opaque init timeout against fresh droplets.
|
||||
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
|
||||
var sshArgv = sshArgs()
|
||||
sshArgv.insert("-T", at: 0)
|
||||
sshArgv.append(hostSpec)
|
||||
sshArgv.append("sh")
|
||||
sshArgv.append("-c")
|
||||
sshArgv.append("bash")
|
||||
sshArgv.append("-lc")
|
||||
sshArgv.append(Self.shellQuote(cmd))
|
||||
let proc = Process()
|
||||
proc.executableURL = URL(fileURLWithPath: sshBinary)
|
||||
@@ -453,12 +457,17 @@ public struct SSHTransport: ServerTransport {
|
||||
return AsyncThrowingStream { continuation in
|
||||
Task.detached { [self] in
|
||||
ensureControlDir()
|
||||
// `bash -lc` (login shell) so PATH picks up profile-only
|
||||
// entries like pipx's `~/.local/bin` — same rationale as
|
||||
// `makeProcess` above. Streaming consumers (log tails)
|
||||
// don't tolerate a missing-binary failure any better than
|
||||
// ACP does.
|
||||
let cmd = ([executable] + args).map { Self.remotePathArg($0) }.joined(separator: " ")
|
||||
var sshArgv = sshArgs()
|
||||
sshArgv.insert("-T", at: 0)
|
||||
sshArgv.append(hostSpec)
|
||||
sshArgv.append("sh")
|
||||
sshArgv.append("-c")
|
||||
sshArgv.append("bash")
|
||||
sshArgv.append("-lc")
|
||||
sshArgv.append(Self.shellQuote(cmd))
|
||||
let proc = Process()
|
||||
proc.executableURL = URL(fileURLWithPath: sshBinary)
|
||||
|
||||
Reference in New Issue
Block a user