From de36411a8d110c46f4daea4b49377f76408eaa1f Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Mon, 4 May 2026 11:25:38 +0200 Subject: [PATCH] fix(remote): size-aware snapshot timeouts and partial-file cleanup (#74) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The remote-DB snapshot pipeline was hardcoded to a 120s scp timeout and a 60s remote-backup timeout. For users with a multi-GB state.db (the report cites 4.87 GB), 120s is wildly insufficient — at typical home upload speeds (5-50 Mbps) a 5GB transfer takes 13 minutes to several hours. scp gets killed mid-transfer, leaves a partially-written .db at the cache path, and every subsequent attempt opens that corrupt file with sqlite3_open returning garbage. Symptom: SSH connects, all diagnostics pass, but Dashboard / Sessions / Memory show no data. Changes to SSHTransport.snapshotSQLite: * Probe `stat` on the remote DB before starting. Drives both the timeout budget and a local-disk-space pre-flight (refuses to start if local Caches volume can't hold size + 500MB margin). * Adaptive timeouts based on remote size: - backup: 60s base + 1s per 100MB, capped at 600s. - scp: 300s base + 0.5s per MB (≈2 MB/s minimum throughput), capped at 3600s. Defaults of 60s/300s when stat fails (still up from 120s on scp). * Add `-C` to scp args. SQLite DBs have lots of zero-padded empty pages and typically compress 30-50% in transit. * On any failure path, remove the partial local snapshot file so the next attempt starts fresh instead of opening a corrupt DB. * Rewrite the generic "Command timed out after Ns" error into a specific "Snapshot transfer timed out after Ns pulling X.X GB state.db from <host>" so users on slow links know what hit the wall instead of seeing a meaningless number. Cannot reproduce locally (no 5GB state.db on hand), but the failure mode is unambiguous from code reading: hardcoded 120s vs. real-world multi-GB transfer durations. 
Closes #74 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ScarfCore/Transport/SSHTransport.swift | 91 +++++++++++++++++-- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHTransport.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHTransport.swift index 77d2244..254448f 100644 --- a/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHTransport.swift +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Transport/SSHTransport.swift @@ -625,6 +625,44 @@ public struct SSHTransport: ServerTransport { public func snapshotSQLite(remotePath: String) throws -> URL { try? FileManager.default.createDirectory(atPath: snapshotDir, withIntermediateDirectories: true) let localPath = snapshotDir + "/state.db" + + // Probe remote size up front. Drives both the timeout budget + // (a multi-GB state.db over a slow link can take many minutes — + // the historical hardcoded 120s scp timeout was wildly + // insufficient for users with 5GB+ DBs, issue #74) and a local- + // disk-space pre-flight so we don't fill the user's Mac + // mid-transfer. Falls back to base timeouts if stat fails. + let remoteSize = stat(remotePath)?.size ?? 0 + + // Pre-flight: refuse to start if local Caches volume can't hold + // the snapshot plus a 500MB safety margin. Better to fail + // up-front with a clear "out of disk" message than to 90%-fill + // the volume and crash mid-transfer. + if remoteSize > 0, + let attrs = try? FileManager.default.attributesOfFileSystem(forPath: snapshotDir), + let free = (attrs[.systemFreeSize] as? NSNumber)?.int64Value, + free < remoteSize + 500_000_000 { + throw TransportError.fileIO( + path: localPath, + underlying: "Insufficient local disk space: state.db is \(Self.formatBytes(remoteSize)), only \(Self.formatBytes(free)) free in \(snapshotDir)." + ) + } + + // Adaptive timeouts. 
`.backup` is sequential page copy at ~100MB/s + // on a typical SSD, but the resulting file can be huge — give it + // 60s base + 1s per 100MB, capped at 10 minutes. SCP is the real + // bottleneck: 300s base + 0.5s per MB (≈2 MB/s minimum throughput, + // which covers users on slow international links), capped at 1 + // hour. A user with a state.db so big it doesn't fit in 1h needs + // a different approach than Scarf can offer (rsync delta, mounted + // FS, etc.). + let backupTimeout: TimeInterval = remoteSize > 0 + ? min(600, 60 + Double(remoteSize) / 100_000_000) + : 60 + let scpTimeout: TimeInterval = remoteSize > 0 + ? min(3600, 300 + Double(remoteSize) / 2_000_000) + : 300 + // `.backup` is WAL-safe: sqlite takes a consistent snapshot without // blocking writers. A plain `cp` of a WAL-mode DB could corrupt. let remoteTmp = "/tmp/scarf-snapshot-\(UUID().uuidString).db" @@ -646,15 +684,19 @@ public struct SSHTransport: ServerTransport { // sqlite3 "$HOME/.hermes/state.db" ".backup '/tmp/scarf-snapshot-XYZ.db'" \ // && sqlite3 '/tmp/scarf-snapshot-XYZ.db' "PRAGMA journal_mode=DELETE;" let backupScript = #"sqlite3 \#(Self.remotePathArg(remotePath)) ".backup '\#(remoteTmp)'" && sqlite3 '\#(remoteTmp)' "PRAGMA journal_mode=DELETE;" > /dev/null"# - let backup = try runRemoteShell(backupScript) + let backup = try runRemoteShell(backupScript, timeout: backupTimeout) if backup.exitCode != 0 { throw TransportError.classifySSHFailure(host: config.host, exitCode: backup.exitCode, stderr: backup.stderrString) } // scp the backup down. scp/sftp expands `~` natively (it goes // through the SSH file-transfer protocol, not a remote shell), so // remoteTmp's `/tmp/...` absolute path round-trips as-is. + // `-C` enables gzip compression in transit; SQLite DBs typically + // have lots of empty pages and zero padding so the wire savings + // are 30-50% in practice. 
ensureControlDir() var scpArgs: [String] = [ + "-C", "-o", "ControlMaster=auto", "-o", "ControlPath=\(controlDir)/%C", "-o", "ControlPersist=600", @@ -666,15 +708,52 @@ if let id = config.identityFile, !id.isEmpty { scpArgs += ["-i", id] } scpArgs.append("\(hostSpec):\(remoteTmp)") scpArgs.append(localPath) - let pull = try runLocal(executable: scpBinary, args: scpArgs, stdin: nil, timeout: 120) - // Regardless of pull outcome, try to clean up the remote tmp. - _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))") - if pull.exitCode != 0 { - throw TransportError.classifySSHFailure(host: config.host, exitCode: pull.exitCode, stderr: pull.stderrString) + + do { + let pull = try runLocal(executable: scpBinary, args: scpArgs, stdin: nil, timeout: scpTimeout) + // Best-effort cleanup of remote tmp regardless of outcome. + _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))") + if pull.exitCode != 0 { + // Wipe the partial local file so a subsequent attempt + // doesn't try to open a corrupted SQLite database. + // Otherwise scp's truncate-on-write semantics leave a + // smaller-than-expected `.db` that sqlite3_open succeeds + // on but every read returns garbage from. + try? FileManager.default.removeItem(atPath: localPath) + throw TransportError.classifySSHFailure(host: config.host, exitCode: pull.exitCode, stderr: pull.stderrString) + } + } catch let error as TransportError { + _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))") + try? FileManager.default.removeItem(atPath: localPath) + // Rewrite "Command timed out after Ns" into something useful + // when it was the snapshot pull that hit the wall — generic + // timeout message gives the user no clue that the cause was + // a 5GB DB on a slow link. 
+ if case .timeout(let secs, _) = error, remoteSize > 0 { + throw TransportError.other( + message: "Snapshot transfer timed out after \(Int(secs))s pulling \(Self.formatBytes(remoteSize)) state.db from \(config.host). Try again on a faster network connection, or reduce the size of the remote state.db." + ) + } + throw error + } catch { + _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))") + try? FileManager.default.removeItem(atPath: localPath) + throw error } + return URL(fileURLWithPath: localPath) } + /// Human-readable byte count for snapshot-pipeline error messages. + /// Wraps `ByteCountFormatter` for callers that just want + /// `"4.87 GB"` — used in the size-aware timeout / disk-space + /// errors emitted by `snapshotSQLite`. + nonisolated private static func formatBytes(_ bytes: Int64) -> String { + let formatter = ByteCountFormatter() + formatter.countStyle = .file + return formatter.string(fromByteCount: bytes) + } + /// Path where the most recent successful snapshot was written — /// returned even when the remote is currently unreachable. The /// data service falls back to this when `snapshotSQLite` throws so