From a6a8cae8ff823382b8a5ecd9913264f8ed3f0354 Mon Sep 17 00:00:00 2001
From: Alan Wizemann <alan@wizemann.com>
Date: Thu, 7 May 2026 11:50:34 +0200
Subject: [PATCH] fix(transport): drain ssh stdout/stderr concurrently to
 unwedge >64KB payloads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #77 — Sessions screen rendered empty even though Dashboard
reported 161 sessions and Activity reported 116. Root cause was a
classic pipe-buffer deadlock in SSHScriptRunner: stdout was read via
`readToEnd()` AFTER the subprocess had exited. macOS pipes default to
a 16–64 KB kernel buffer; once the remote `sqlite3 -json` script wrote
more than that to its stdout, ssh back-pressured across the wire,
sshd back-pressured sqlite3, sqlite3 blocked, the script never
finished, the 30-second timeout fired, `streamScript` threw, and
`HermesDataService.sessionListSnapshot()` swallowed the failure into
an empty array. Empty Sessions list. Dashboard kept working because
its smaller LIMIT 5 payload fit under the threshold.

Why this was a v2.7 regression specifically: 20cc3a2 folded the
previously-separate sessions + previews queries into a single batched
round-trip (perf win for remote users). The new combined payload for
~150+ sessions crossed the buffer threshold for the first time.

Fix: drain stdout/stderr concurrently with the running process via
Foundation's `FileHandle.readabilityHandler`, accumulating chunks
into an NSLock-guarded `Data` buffer. The kernel pipe never fills,
the subprocess never blocks, the script returns the full payload.
Same change applied to both the SSH path (`runOverSSH`) and the
local path (`runLocally`) — they had identical bug shapes.

Adds SSHScriptRunnerTests with three regression checks: a 256 KB
synthetic payload that would have wedged pre-fix, a small-payload
sanity round-trip, and a non-zero exit propagation check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../ScarfCore/Transport/SSHScriptRunner.swift | 104 +++++++++++++++++-
 .../ScarfCoreTests/SSHScriptRunnerTests.swift |  85 ++++++++++++++
 2 files changed, 185 insertions(+), 4 deletions(-)
 create mode 100644 scarf/Packages/ScarfCore/Tests/ScarfCoreTests/SSHScriptRunnerTests.swift

diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHScriptRunner.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHScriptRunner.swift
index af5010e..0dbabee 100644
--- a/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHScriptRunner.swift
+++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHScriptRunner.swift
@@ -49,6 +49,34 @@ public enum SSHScriptRunner {
         }
     }
 
+    /// Lock-protected `Data` accumulator used by the stdout/stderr
+    /// readability handlers below. Two of these per script run, one per
+    /// stream. `@unchecked Sendable` because mutation goes through the
+    /// `NSLock` — Swift can't see that.
+    ///
+    /// Why this exists (issue #77): the previous implementation read
+    /// stdout/stderr via `readToEnd()` *after* the subprocess exited.
+    /// On macOS pipes default to a 16–64 KB kernel buffer; once
+    /// `sqlite3 -json` writes more than that, the SSH client back-
+    /// pressures over the wire, the remote sqlite3 blocks, the script
+    /// never finishes, the 30 s timeout fires, and the caller sees
+    /// "Script timed out" + an empty result set. v2.7's
+    /// `sessionListSnapshot(limit: 500)` crossed that threshold for
+    /// any user with ~150+ sessions. Draining concurrently with
+    /// `readabilityHandler` removes the back-pressure.
+    private final class LockedData: @unchecked Sendable {
+        private let lock = NSLock()
+        private var buf = Data()
+        func append(_ chunk: Data) {
+            lock.lock(); defer { lock.unlock() }
+            buf.append(chunk)
+        }
+        func snapshot() -> Data {
+            lock.lock(); defer { lock.unlock() }
+            return buf
+        }
+    }
+
     public enum Outcome: Sendable {
         /// Couldn't even reach the remote (process spawn failed,
         /// timeout before any output, network refused). Carries the
@@ -151,9 +179,35 @@ public enum SSHScriptRunner {
             proc.standardOutput = stdoutPipe
             proc.standardError = stderrPipe
 
+            // Drain stdout/stderr concurrently with the running process —
+            // see the LockedData docstring above for the issue-#77
+            // back-story. Without these handlers a >64 KB script output
+            // wedges the pipe + ssh + remote sqlite3 chain and the only
+            // visible symptom is a timeout.
+            let outBuf = LockedData()
+            let errBuf = LockedData()
+            stdoutPipe.fileHandleForReading.readabilityHandler = { handle in
+                let chunk = handle.availableData
+                if chunk.isEmpty {
+                    handle.readabilityHandler = nil
+                } else {
+                    outBuf.append(chunk)
+                }
+            }
+            stderrPipe.fileHandleForReading.readabilityHandler = { handle in
+                let chunk = handle.availableData
+                if chunk.isEmpty {
+                    handle.readabilityHandler = nil
+                } else {
+                    errBuf.append(chunk)
+                }
+            }
+
             do {
                 try proc.run()
             } catch {
+                stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                stderrPipe.fileHandleForReading.readabilityHandler = nil
                 return .connectFailure("Failed to launch ssh: \(error.localizedDescription)")
             }
 
@@ -172,6 +226,8 @@ public enum SSHScriptRunner {
                 // belt-and-suspenders.
                 if cancelFlag.isCancelled || Task.isCancelled {
                     proc.terminate()
+                    stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                    stderrPipe.fileHandleForReading.readabilityHandler = nil
                     try? stdoutPipe.fileHandleForReading.close()
                     try? stderrPipe.fileHandleForReading.close()
                     return .connectFailure("Script cancelled")
@@ -180,6 +236,8 @@ public enum SSHScriptRunner {
             }
             if proc.isRunning {
                 proc.terminate()
+                stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                stderrPipe.fileHandleForReading.readabilityHandler = nil
                 // Pipe fds leak otherwise — closing on the timeout branch
                 // matches the success-path discipline (see CLAUDE.md
                 // "Always close both fileHandleForReading and
@@ -188,8 +246,14 @@ public enum SSHScriptRunner {
                 try? stderrPipe.fileHandleForReading.close()
                 return .connectFailure("Script timed out after \(Int(timeout))s")
             }
-            let out = (try? stdoutPipe.fileHandleForReading.readToEnd()) ?? Data()
-            let err = (try? stderrPipe.fileHandleForReading.readToEnd()) ?? Data()
+            // Detach the readabilityHandlers and capture whatever the
+            // accumulator has. The handler may have already seen EOF
+            // (`chunk.isEmpty`) and self-cleared, but assigning nil is
+            // idempotent and guards against a late tick from the queue.
+            stdoutPipe.fileHandleForReading.readabilityHandler = nil
+            stderrPipe.fileHandleForReading.readabilityHandler = nil
+            let out = outBuf.snapshot()
+            let err = errBuf.snapshot()
             // Best-effort fd close — Pipe leaks fd's otherwise.
             try? stdoutPipe.fileHandleForReading.close()
             try? stderrPipe.fileHandleForReading.close()
@@ -213,15 +277,43 @@ public enum SSHScriptRunner {
             let stderrPipe = Pipe()
             proc.standardOutput = stdoutPipe
             proc.standardError = stderrPipe
+
+            // Drain concurrently — same pipe-buffer fix as runOverSSH.
+            // Local scripts can also blow past the 16–64 KB pipe buffer
+            // (e.g. local `sqlite3 -json` over a fat result set) and
+            // would wedge in exactly the same way.
+            let outBuf = LockedData()
+            let errBuf = LockedData()
+            stdoutPipe.fileHandleForReading.readabilityHandler = { handle in
+                let chunk = handle.availableData
+                if chunk.isEmpty {
+                    handle.readabilityHandler = nil
+                } else {
+                    outBuf.append(chunk)
+                }
+            }
+            stderrPipe.fileHandleForReading.readabilityHandler = { handle in
+                let chunk = handle.availableData
+                if chunk.isEmpty {
+                    handle.readabilityHandler = nil
+                } else {
+                    errBuf.append(chunk)
+                }
+            }
+
             do {
                 try proc.run()
             } catch {
+                stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                stderrPipe.fileHandleForReading.readabilityHandler = nil
                 return .connectFailure("Failed to launch /bin/sh: \(error.localizedDescription)")
             }
             let deadline = Date().addingTimeInterval(timeout)
             while proc.isRunning && Date() < deadline {
                 if cancelFlag.isCancelled || Task.isCancelled {
                     proc.terminate()
+                    stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                    stderrPipe.fileHandleForReading.readabilityHandler = nil
                     try? stdoutPipe.fileHandleForReading.close()
                     try? stderrPipe.fileHandleForReading.close()
                     return .connectFailure("Script cancelled")
@@ -230,12 +322,16 @@ public enum SSHScriptRunner {
             }
             if proc.isRunning {
                 proc.terminate()
+                stdoutPipe.fileHandleForReading.readabilityHandler = nil
+                stderrPipe.fileHandleForReading.readabilityHandler = nil
                 try? stdoutPipe.fileHandleForReading.close()
                 try? stderrPipe.fileHandleForReading.close()
                 return .connectFailure("Script timed out after \(Int(timeout))s")
             }
-            let out = (try? stdoutPipe.fileHandleForReading.readToEnd()) ?? Data()
-            let err = (try? stderrPipe.fileHandleForReading.readToEnd()) ?? Data()
+            stdoutPipe.fileHandleForReading.readabilityHandler = nil
+            stderrPipe.fileHandleForReading.readabilityHandler = nil
+            let out = outBuf.snapshot()
+            let err = errBuf.snapshot()
             try? stdoutPipe.fileHandleForReading.close()
             try? stderrPipe.fileHandleForReading.close()
             return .completed(
diff --git a/scarf/Packages/ScarfCore/Tests/ScarfCoreTests/SSHScriptRunnerTests.swift b/scarf/Packages/ScarfCore/Tests/ScarfCoreTests/SSHScriptRunnerTests.swift
new file mode 100644
index 0000000..c2d997a
--- /dev/null
+++ b/scarf/Packages/ScarfCore/Tests/ScarfCoreTests/SSHScriptRunnerTests.swift
@@ -0,0 +1,85 @@
+import Testing
+import Foundation
+@testable import ScarfCore
+
+/// Regression tests for `SSHScriptRunner`. Mac-only because the
+/// implementation relies on `Foundation.Process`, which doesn't exist
+/// on Swift Linux. Drives the `runLocally` path so we don't need an
+/// SSH endpoint in CI.
+#if os(macOS)
+@Suite struct SSHScriptRunnerTests {
+
+    /// Issue #77 regression. Pre-fix the runner read stdout via
+    /// `readToEnd()` *after* the subprocess exited; once the script's
+    /// output crossed the kernel's pipe buffer (16–64 KB on macOS) the
+    /// process wedged because nothing was draining the read end. The
+    /// only visible symptom was a 30-second timeout and an empty
+    /// result.
+    ///
+    /// This script writes ~256 KB of bytes — comfortably past every
+    /// pipe-buffer threshold. With the readabilityHandler drain in
+    /// place the run should complete in well under a second and
+    /// return the full payload.
+    @Test func drainsLargeStdoutWithoutTimeout() async throws {
+        // 256 lines × 1024 bytes/line = 256 KB.
+        let script = """
+        for i in $(seq 1 256); do
+            printf '%04d:' "$i"
+            printf '%.0sx' $(seq 1 1018)
+            printf '\\n'
+        done
+        """
+        let outcome = await SSHScriptRunner.run(
+            script: script,
+            context: .local,
+            timeout: 10
+        )
+        switch outcome {
+        case .completed(let stdout, _, let exitCode):
+            #expect(exitCode == 0)
+            // 256 lines + final newline.
+            let lines = stdout.split(separator: "\n", omittingEmptySubsequences: false)
+            #expect(lines.count >= 256)
+            #expect(stdout.utf8.count >= 256 * 1024)
+        case .connectFailure(let reason):
+            Issue.record("Expected completion, got connectFailure: \(reason)")
+        }
+    }
+
+    /// Sanity check that small scripts still come back the way they
+    /// did before the drain refactor. Guards against an off-by-one in
+    /// the readability handler that swallowed trailing bytes.
+    @Test func smallScriptPayloadRoundTrips() async throws {
+        let outcome = await SSHScriptRunner.run(
+            script: "printf 'hello\\n' && printf 'world\\n' >&2 && exit 0",
+            context: .local,
+            timeout: 5
+        )
+        switch outcome {
+        case .completed(let stdout, let stderr, let exitCode):
+            #expect(exitCode == 0)
+            #expect(stdout == "hello\n")
+            #expect(stderr == "world\n")
+        case .connectFailure(let reason):
+            Issue.record("Expected completion, got connectFailure: \(reason)")
+        }
+    }
+
+    /// Non-zero exit codes should still be reported as `.completed`
+    /// with the captured stdout/stderr — unchanged contract.
+    @Test func nonZeroExitIsReportedAsCompleted() async throws {
+        let outcome = await SSHScriptRunner.run(
+            script: "echo nope >&2 && exit 7",
+            context: .local,
+            timeout: 5
+        )
+        switch outcome {
+        case .completed(_, let stderr, let exitCode):
+            #expect(exitCode == 7)
+            #expect(stderr.contains("nope"))
+        case .connectFailure(let reason):
+            Issue.record("Expected completion, got connectFailure: \(reason)")
+        }
+    }
+}
+#endif