mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 18:44:45 +00:00
fix(connection-pill): unify pill probe with diagnostics over raw ssh (#44)
Issue #44: pill stuck on "Connected — can't read Hermes state" while Run Diagnostics shows 14/14 passing. Both code paths probe the same question (`[ -r ~/.hermes/config.yaml ]`) yet disagreed. Root cause: the pill called `transport.runProcess(executable: "/bin/sh", args: ["-c", script])` which routes through SSHTransport.remotePathArg quoting. That quoting double-quotes every argument to rewrite `~/` → `$HOME/`, mangling multi-line shell scripts containing `"$VAR"` references and nested quotes — the remote received a scrambled `if`-test and `$H/config.yaml` evaluated to `"/config.yaml"` (or worse), so tier-2 always read as failed. `RemoteDiagnosticsViewModel` already documented this exact bug and worked around it locally: invoke `/usr/bin/ssh ... -- /bin/sh -s` directly and pipe the script via stdin so it travels as opaque bytes. The pill never got the same treatment, hence the silent disagreement. The #53 granular-cause script I added a few commits back made the mangling worse — more $VARs, more `[ ! -e ]` tests, more nested quoting, all things that increase the runProcess quoting attack surface. Move the diagnostics workaround into shared ScarfCore code as `SSHScriptRunner.run(script:context:timeout:)`. Both the pill probe and the diagnostics view now use it, so they always see the same remote shell state. macOS-only via `#if os(macOS)` (Foundation.Process isn't on iOS); iOS callers never reach this surface anyway — ScarfGo uses Citadel-based SSH transports for its own flows. Other tidy-ups: - `ConnectionStatusViewModel` no longer holds a `transport` instance — the field was only used by the now-replaced runProcess path. - `RemoteDiagnosticsViewModel` loses ~120 lines of duplicated `runOverSSH` / `runLocally` / `controlDirPath` helpers; calls into `SSHScriptRunner.run` directly. Risk: low. The SSH path is the same shape that's been shipping in the diagnostics view since #19. The pill's 15s heartbeat gains a small forking-an-ssh-process overhead vs the ControlMaster- multiplexed runProcess, which is invisible at that cadence and amortized by ssh's own ControlMaster (the `-o ControlMaster=auto` options match SSHTransport's, so the multiplex socket is shared). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,183 @@
|
||||
import Foundation
|
||||
|
||||
/// Runs multi-line shell scripts on a server (local or SSH) without
|
||||
/// going through `ServerTransport.runProcess`.
|
||||
///
|
||||
/// **Why this exists.** `SSHTransport.runProcess` quotes every argument
|
||||
/// via `remotePathArg` (it rewrites `~/` → `$HOME/`), which is correct
|
||||
/// for path arguments but mangles a multi-line script containing
|
||||
/// `"$VAR"` references, nested quotes, and control structures. The
|
||||
/// remote receives a scrambled string and the script silently
|
||||
/// produces no useful output.
|
||||
///
|
||||
/// `RemoteDiagnosticsViewModel` originally documented this and worked
|
||||
/// around it locally. Issue #44 surfaced the same bug for the
|
||||
/// connection-status pill (multi-line probe script through
|
||||
/// `runProcess` → tier 2 always reads as failed even when the file
|
||||
/// is readable, while diagnostics — which used the workaround —
|
||||
/// reports 14/14 passing). This helper centralises the workaround so
|
||||
/// any future caller running a script gets it for free.
|
||||
///
|
||||
/// **Approach.** We invoke `/usr/bin/ssh ... -- /bin/sh -s` directly
|
||||
/// and pipe the script via stdin, so the script travels as a single
|
||||
/// opaque byte stream that the remote shell parses unchanged. Local
|
||||
/// contexts skip ssh and just pipe to `/bin/sh -s` — same shape so
|
||||
/// callers can treat both uniformly.
|
||||
public enum SSHScriptRunner {
|
||||
|
||||
public enum Outcome: Sendable {
|
||||
/// Couldn't even reach the remote (process spawn failed,
|
||||
/// timeout before any output, network refused). Carries the
|
||||
/// human-readable reason.
|
||||
case connectFailure(String)
|
||||
/// Script ran to completion (or until timeout cut it short
|
||||
/// after producing partial output). Exit code, stdout, stderr
|
||||
/// are reported as captured.
|
||||
case completed(stdout: String, stderr: String, exitCode: Int32)
|
||||
}
|
||||
|
||||
/// Run `script` against the given context. Times out after
|
||||
/// `timeout` seconds, killing the subprocess if it overruns.
|
||||
///
|
||||
/// **Platforms.** Real implementation is macOS-only — relies on
|
||||
/// `Foundation.Process` which iOS doesn't ship. iOS callers
|
||||
/// (ScarfGo) use Citadel-backed SSH transports for their own
|
||||
/// flows; they never reach this entry point. To keep ScarfCore
|
||||
/// cross-platform we return a connect failure on non-macOS so
|
||||
/// the file compiles everywhere.
|
||||
public static func run(script: String, context: ServerContext, timeout: TimeInterval = 30) async -> Outcome {
|
||||
#if os(macOS)
|
||||
switch context.kind {
|
||||
case .local:
|
||||
return await runLocally(script: script, timeout: timeout)
|
||||
case .ssh(let config):
|
||||
return await runOverSSH(script: script, config: config, timeout: timeout)
|
||||
}
|
||||
#else
|
||||
return .connectFailure("SSHScriptRunner is only available on macOS")
|
||||
#endif
|
||||
}
|
||||
|
||||
// MARK: - SSH path
|
||||
|
||||
#if os(macOS)
|
||||
private static func runOverSSH(script: String, config: SSHConfig, timeout: TimeInterval) async -> Outcome {
|
||||
var sshArgv: [String] = [
|
||||
"-o", "ControlMaster=auto",
|
||||
"-o", "ControlPath=\(SSHTransport.controlDirPath())/%C",
|
||||
"-o", "ControlPersist=600",
|
||||
"-o", "ServerAliveInterval=30",
|
||||
"-o", "ConnectTimeout=10",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
"-o", "LogLevel=QUIET",
|
||||
"-o", "BatchMode=yes",
|
||||
"-T", // no pty — keep stdin/stdout a clean byte stream
|
||||
]
|
||||
if let port = config.port { sshArgv += ["-p", String(port)] }
|
||||
if let id = config.identityFile, !id.isEmpty {
|
||||
sshArgv += ["-i", id]
|
||||
}
|
||||
let hostSpec: String
|
||||
if let user = config.user, !user.isEmpty { hostSpec = "\(user)@\(config.host)" }
|
||||
else { hostSpec = config.host }
|
||||
sshArgv.append(hostSpec)
|
||||
sshArgv.append("--")
|
||||
sshArgv.append("/bin/sh")
|
||||
sshArgv.append("-s") // read script from stdin
|
||||
|
||||
return await Task.detached { () -> Outcome in
|
||||
let proc = Process()
|
||||
proc.executableURL = URL(fileURLWithPath: "/usr/bin/ssh")
|
||||
proc.arguments = sshArgv
|
||||
|
||||
// Inherit shell-derived SSH_AUTH_SOCK so ssh-agent reaches.
|
||||
// Same path SSHTransport uses internally — see
|
||||
// `environmentEnricher` set at app boot.
|
||||
var env = ProcessInfo.processInfo.environment
|
||||
if let enricher = SSHTransport.environmentEnricher {
|
||||
let shellEnv = enricher()
|
||||
for key in ["SSH_AUTH_SOCK", "SSH_AGENT_PID"] {
|
||||
if env[key] == nil, let v = shellEnv[key], !v.isEmpty {
|
||||
env[key] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
proc.environment = env
|
||||
|
||||
let stdinPipe = Pipe()
|
||||
let stdoutPipe = Pipe()
|
||||
let stderrPipe = Pipe()
|
||||
proc.standardInput = stdinPipe
|
||||
proc.standardOutput = stdoutPipe
|
||||
proc.standardError = stderrPipe
|
||||
|
||||
do {
|
||||
try proc.run()
|
||||
} catch {
|
||||
return .connectFailure("Failed to launch ssh: \(error.localizedDescription)")
|
||||
}
|
||||
|
||||
if let data = script.data(using: .utf8) {
|
||||
try? stdinPipe.fileHandleForWriting.write(contentsOf: data)
|
||||
}
|
||||
try? stdinPipe.fileHandleForWriting.close()
|
||||
|
||||
let deadline = Date().addingTimeInterval(timeout)
|
||||
while proc.isRunning && Date() < deadline {
|
||||
try? await Task.sleep(nanoseconds: 100_000_000)
|
||||
}
|
||||
if proc.isRunning {
|
||||
proc.terminate()
|
||||
return .connectFailure("Script timed out after \(Int(timeout))s")
|
||||
}
|
||||
let out = (try? stdoutPipe.fileHandleForReading.readToEnd()) ?? Data()
|
||||
let err = (try? stderrPipe.fileHandleForReading.readToEnd()) ?? Data()
|
||||
// Best-effort fd close — Pipe leaks fd's otherwise.
|
||||
try? stdoutPipe.fileHandleForReading.close()
|
||||
try? stderrPipe.fileHandleForReading.close()
|
||||
return .completed(
|
||||
stdout: String(data: out, encoding: .utf8) ?? "",
|
||||
stderr: String(data: err, encoding: .utf8) ?? "",
|
||||
exitCode: proc.terminationStatus
|
||||
)
|
||||
}.value
|
||||
}
|
||||
|
||||
// MARK: - Local path
|
||||
|
||||
private static func runLocally(script: String, timeout: TimeInterval) async -> Outcome {
|
||||
return await Task.detached { () -> Outcome in
|
||||
let proc = Process()
|
||||
proc.executableURL = URL(fileURLWithPath: "/bin/sh")
|
||||
proc.arguments = ["-c", script]
|
||||
|
||||
let stdoutPipe = Pipe()
|
||||
let stderrPipe = Pipe()
|
||||
proc.standardOutput = stdoutPipe
|
||||
proc.standardError = stderrPipe
|
||||
do {
|
||||
try proc.run()
|
||||
} catch {
|
||||
return .connectFailure("Failed to launch /bin/sh: \(error.localizedDescription)")
|
||||
}
|
||||
let deadline = Date().addingTimeInterval(timeout)
|
||||
while proc.isRunning && Date() < deadline {
|
||||
try? await Task.sleep(nanoseconds: 100_000_000)
|
||||
}
|
||||
if proc.isRunning {
|
||||
proc.terminate()
|
||||
return .connectFailure("Script timed out after \(Int(timeout))s")
|
||||
}
|
||||
let out = (try? stdoutPipe.fileHandleForReading.readToEnd()) ?? Data()
|
||||
let err = (try? stderrPipe.fileHandleForReading.readToEnd()) ?? Data()
|
||||
try? stdoutPipe.fileHandleForReading.close()
|
||||
try? stderrPipe.fileHandleForReading.close()
|
||||
return .completed(
|
||||
stdout: String(data: out, encoding: .utf8) ?? "",
|
||||
stderr: String(data: err, encoding: .utf8) ?? "",
|
||||
exitCode: proc.terminationStatus
|
||||
)
|
||||
}.value
|
||||
}
|
||||
#endif // os(macOS)
|
||||
}
|
||||
+20
-23
@@ -70,12 +70,10 @@ public final class ConnectionStatusViewModel {
|
||||
private let consecutiveFailureThreshold = 2
|
||||
|
||||
public let context: ServerContext
|
||||
private let transport: any ServerTransport
|
||||
private var probeTask: Task<Void, Never>?
|
||||
|
||||
public init(context: ServerContext) {
|
||||
self.context = context
|
||||
self.transport = context.makeTransport()
|
||||
if !context.isRemote {
|
||||
// Local contexts are always considered connected — no network
|
||||
// or auth can fail.
|
||||
@@ -108,7 +106,7 @@ public final class ConnectionStatusViewModel {
|
||||
}
|
||||
|
||||
private func probeOnce() async {
|
||||
let snapshot = transport
|
||||
let snapshot = context
|
||||
let hermesHome = context.paths.home
|
||||
// Two-tier probe in one SSH round-trip:
|
||||
// tier 1: `true` — raw connectivity / auth / ControlMaster path
|
||||
@@ -162,39 +160,38 @@ public final class ConnectionStatusViewModel {
|
||||
case failure(TransportError)
|
||||
}
|
||||
|
||||
let outcome: ProbeOutcome = await Task.detached {
|
||||
do {
|
||||
let probe = try snapshot.runProcess(
|
||||
executable: "/bin/sh",
|
||||
args: ["-c", script],
|
||||
stdin: nil,
|
||||
timeout: 10
|
||||
)
|
||||
guard probe.exitCode == 0 else {
|
||||
return .failure(.commandFailed(exitCode: probe.exitCode, stderr: probe.stderrString))
|
||||
// Issue #44: previously this used `transport.runProcess(executable:
|
||||
// "/bin/sh", args: ["-c", script])`, which goes through
|
||||
// SSHTransport's `remotePathArg` quoting. That mangles multi-line
|
||||
// shell scripts containing `"$VAR"` references and nested
|
||||
// quotes — the remote received a scrambled string and the if-test
|
||||
// for config.yaml readability silently failed even when the file
|
||||
// was readable. Result: 14/14 diagnostics passing AND a stuck
|
||||
// "Connected — can't read Hermes state" pill, simultaneously,
|
||||
// because diagnostics had its own runOverSSH workaround. Now
|
||||
// both paths use SSHScriptRunner so they always agree.
|
||||
let outcome: ProbeOutcome = await {
|
||||
let result = await SSHScriptRunner.run(script: script, context: snapshot, timeout: 10)
|
||||
switch result {
|
||||
case .connectFailure(let msg):
|
||||
return .failure(.other(message: msg))
|
||||
case .completed(let out, let stderr, let exitCode):
|
||||
guard exitCode == 0 else {
|
||||
return .failure(.commandFailed(exitCode: exitCode, stderr: stderr))
|
||||
}
|
||||
let out = probe.stdoutString
|
||||
let tier1 = out.contains("TIER1:0")
|
||||
let tier2 = out.contains("TIER2:0")
|
||||
if !tier1 {
|
||||
// The script itself didn't reach tier 1 — treat as connection failure.
|
||||
return .failure(.commandFailed(exitCode: 1, stderr: out))
|
||||
}
|
||||
if tier2 {
|
||||
return .connected
|
||||
}
|
||||
// Connected but tier 2 failed. Parse the granular cause
|
||||
// code; older remotes that don't emit a tag fall through
|
||||
// to `.unknown` with a generic hint (issue #53).
|
||||
let cause = Self.parseDegradedCause(stdout: out)
|
||||
let (reason, hint) = Self.describe(cause: cause, hermesHome: hermesHome)
|
||||
return .degraded(reason: reason, hint: hint, cause: cause)
|
||||
} catch let e as TransportError {
|
||||
return .failure(e)
|
||||
} catch {
|
||||
return .failure(.other(message: error.localizedDescription))
|
||||
}
|
||||
}.value
|
||||
}()
|
||||
|
||||
switch outcome {
|
||||
case .connected:
|
||||
|
||||
Reference in New Issue
Block a user