fix(remote): size-aware snapshot timeouts and partial-file cleanup (#74)

The remote-DB snapshot pipeline was hardcoded to a 120s scp timeout and
a 60s remote-backup timeout. For users with a multi-GB state.db (the
report cites 4.87 GB), 120s is wildly insufficient: at typical home
upload speeds (5-50 Mbps) a 5GB transfer takes 13 minutes to several
hours. scp gets killed mid-transfer, leaves a partially-written .db at
the cache path, and every subsequent attempt opens that corrupt file;
sqlite3_open succeeds, but every read returns garbage. Symptom: SSH
connects, all diagnostics pass, but Dashboard / Sessions / Memory show
no data.
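
Context for the "opens but reads garbage" symptom: SQLite opens
lazily, deferring file validation until the first read, so a truncated
snapshot is not rejected up front. Minimal sketch (path and table name
are illustrative, not from this codebase):

    import SQLite3
    var db: OpaquePointer?
    // open succeeds (SQLITE_OK) even on a truncated/partial file...
    sqlite3_open("/tmp/partial-state.db", &db)
    // ...and the failure (SQLITE_NOTADB or SQLITE_CORRUPT, "database
    // disk image is malformed") only surfaces on the first real read:
    sqlite3_exec(db, "SELECT count(*) FROM sessions", nil, nil, nil)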

Changes to SSHTransport.snapshotSQLite:

* Probe `stat` on the remote DB before starting. Drives both the
  timeout budget and a local-disk-space pre-flight (refuses to start
  if local Caches volume can't hold size + 500MB margin).
* Adaptive timeouts based on remote size (worked example after this
  list):
  - backup: 60s base + 1s per 100MB, capped at 600s.
  - scp:    300s base + 0.5s per MB (≈2 MB/s minimum throughput),
            capped at 3600s.
  Defaults of 60s/300s when stat fails (still up from 120s on scp).
* Add `-C` to scp args. SQLite DBs have lots of zero-padded empty
  pages and typically compress 30-50% in transit.
* On any failure path, remove the partial local snapshot file so the
  next attempt starts fresh instead of opening a corrupt DB.
* Rewrite the generic "Command timed out after Ns" error into a
  specific "Snapshot transfer timed out after Ns pulling X.X GB
  state.db from <host>" so users on slow links know what hit the
  wall instead of seeing a meaningless number.
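
Worked example for the timeout formulas, plugging in the 4.87 GB
state.db from the report (arithmetic only, not code from the diff):

    let remoteSize: Int64 = 4_870_000_000
    min(600,  60  + Double(remoteSize) / 100_000_000)  // backup ≈ 109s
    min(3600, 300 + Double(remoteSize) / 2_000_000)    // scp ≈ 2735s (~46 min)

At the 2 MB/s floor, 2735s moves about 5.5 GB, so the budget clears
the reported DB with some headroom; the old 120s cap died at ~240 MB.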

Cannot reproduce locally (no 5GB state.db on hand), but the failure
mode is unambiguous from code reading: hardcoded 120s vs. real-world
multi-GB transfer durations.
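
A plausible local repro for anyone who wants to verify end-to-end
(path hypothetical; any multi-GB DB works): fabricate a state.db from
zeroblob pages, which also compress well in transit and so double as a
check on the scp -C savings claimed above:

    import SQLite3
    var db: OpaquePointer?
    sqlite3_open("/tmp/state.db", &db)
    sqlite3_exec(db, "CREATE TABLE filler(b BLOB)", nil, nil, nil)
    for _ in 0..<10 {  // 10 x 500 MB zero blobs ≈ 5 GB file
        sqlite3_exec(db, "INSERT INTO filler VALUES (zeroblob(500000000))", nil, nil, nil)
    }
    sqlite3_close(db)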

Closes #74

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Alan Wizemann
Date: 2026-05-04 11:25:38 +02:00
parent 6a7ac21ebe
commit de36411a8d
@@ -625,6 +625,44 @@ public struct SSHTransport: ServerTransport {
     public func snapshotSQLite(remotePath: String) throws -> URL {
         try? FileManager.default.createDirectory(atPath: snapshotDir, withIntermediateDirectories: true)
         let localPath = snapshotDir + "/state.db"
+        // Probe remote size up front. Drives both the timeout budget
+        // (a multi-GB state.db over a slow link can take many minutes;
+        // the historical hardcoded 120s scp timeout was wildly
+        // insufficient for users with 5GB+ DBs, issue #74) and a local-
+        // disk-space pre-flight so we don't fill the user's Mac
+        // mid-transfer. Falls back to base timeouts if stat fails.
+        let remoteSize = stat(remotePath)?.size ?? 0
+        // Pre-flight: refuse to start if local Caches volume can't hold
+        // the snapshot plus a 500MB safety margin. Better to fail
+        // up-front with a clear "out of disk" message than to 90%-fill
+        // the volume and crash mid-transfer.
+        if remoteSize > 0,
+           let attrs = try? FileManager.default.attributesOfFileSystem(forPath: snapshotDir),
+           let free = (attrs[.systemFreeSize] as? NSNumber)?.int64Value,
+           free < remoteSize + 500_000_000 {
+            throw TransportError.fileIO(
+                path: localPath,
+                underlying: "Insufficient local disk space: state.db is \(Self.formatBytes(remoteSize)), only \(Self.formatBytes(free)) free in \(snapshotDir)."
+            )
+        }
+        // Adaptive timeouts. `.backup` is a sequential page copy at
+        // ~100MB/s on a typical SSD, but the resulting file can be huge;
+        // give it 60s base + 1s per 100MB, capped at 10 minutes. SCP is
+        // the real bottleneck: 300s base + 0.5s per MB (2 MB/s minimum
+        // throughput, which covers users on slow international links),
+        // capped at 1 hour. A user with a state.db so big it doesn't fit
+        // in 1h needs a different approach than Scarf can offer (rsync
+        // delta, mounted FS, etc.).
+        let backupTimeout: TimeInterval = remoteSize > 0
+            ? min(600, 60 + Double(remoteSize) / 100_000_000)
+            : 60
+        let scpTimeout: TimeInterval = remoteSize > 0
+            ? min(3600, 300 + Double(remoteSize) / 2_000_000)
+            : 300
         // `.backup` is WAL-safe: sqlite takes a consistent snapshot without
         // blocking writers. A plain `cp` of a WAL-mode DB could corrupt.
         let remoteTmp = "/tmp/scarf-snapshot-\(UUID().uuidString).db"
@@ -646,15 +684,19 @@ public struct SSHTransport: ServerTransport {
         // sqlite3 "$HOME/.hermes/state.db" ".backup '/tmp/scarf-snapshot-XYZ.db'" \
         //   && sqlite3 '/tmp/scarf-snapshot-XYZ.db' "PRAGMA journal_mode=DELETE;"
         let backupScript = #"sqlite3 \#(Self.remotePathArg(remotePath)) ".backup '\#(remoteTmp)'" && sqlite3 '\#(remoteTmp)' "PRAGMA journal_mode=DELETE;" > /dev/null"#
-        let backup = try runRemoteShell(backupScript)
+        let backup = try runRemoteShell(backupScript, timeout: backupTimeout)
         if backup.exitCode != 0 {
             throw TransportError.classifySSHFailure(host: config.host, exitCode: backup.exitCode, stderr: backup.stderrString)
         }
         // scp the backup down. scp/sftp expands `~` natively (it goes
         // through the SSH file-transfer protocol, not a remote shell), so
         // remoteTmp's `/tmp/...` absolute path round-trips as-is.
+        // `-C` enables gzip compression in transit; SQLite DBs typically
+        // have lots of empty pages and zero padding, so the wire savings
+        // are 30-50% in practice.
         ensureControlDir()
         var scpArgs: [String] = [
+            "-C",
             "-o", "ControlMaster=auto",
             "-o", "ControlPath=\(controlDir)/%C",
             "-o", "ControlPersist=600",
@@ -666,15 +708,52 @@ public struct SSHTransport: ServerTransport {
         if let id = config.identityFile, !id.isEmpty { scpArgs += ["-i", id] }
         scpArgs.append("\(hostSpec):\(remoteTmp)")
         scpArgs.append(localPath)
-        let pull = try runLocal(executable: scpBinary, args: scpArgs, stdin: nil, timeout: 120)
-        // Regardless of pull outcome, try to clean up the remote tmp.
-        _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))")
-        if pull.exitCode != 0 {
-            throw TransportError.classifySSHFailure(host: config.host, exitCode: pull.exitCode, stderr: pull.stderrString)
+        do {
+            let pull = try runLocal(executable: scpBinary, args: scpArgs, stdin: nil, timeout: scpTimeout)
+            // Best-effort cleanup of remote tmp regardless of outcome.
+            _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))")
+            if pull.exitCode != 0 {
+                // Wipe the partial local file so a subsequent attempt
+                // doesn't try to open a corrupted SQLite database.
+                // Otherwise scp's truncate-on-write semantics leave a
+                // smaller-than-expected `.db` that sqlite3_open opens
+                // cleanly but whose reads all return garbage.
+                try? FileManager.default.removeItem(atPath: localPath)
+                throw TransportError.classifySSHFailure(host: config.host, exitCode: pull.exitCode, stderr: pull.stderrString)
+            }
+        } catch let error as TransportError {
+            _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))")
+            try? FileManager.default.removeItem(atPath: localPath)
+            // Rewrite "Command timed out after Ns" into something useful
+            // when it was the snapshot pull that hit the wall; the
+            // generic timeout message gives the user no clue that the
+            // cause was a 5GB DB on a slow link.
+            if case .timeout(let secs, _) = error, remoteSize > 0 {
+                throw TransportError.other(
+                    message: "Snapshot transfer timed out after \(Int(secs))s pulling \(Self.formatBytes(remoteSize)) state.db from \(config.host). Try again on a faster network connection, or reduce the size of the remote state.db."
+                )
+            }
+            throw error
+        } catch {
+            _ = try? runRemoteShell("rm -f \(Self.remotePathArg(remoteTmp))")
+            try? FileManager.default.removeItem(atPath: localPath)
+            throw error
+        }
         return URL(fileURLWithPath: localPath)
     }
+    /// Human-readable byte count for snapshot-pipeline error messages.
+    /// Wraps `ByteCountFormatter` for callers that just want
+    /// `"4.87 GB"`; used in the size-aware timeout / disk-space
+    /// errors emitted by `snapshotSQLite`.
+    nonisolated private static func formatBytes(_ bytes: Int64) -> String {
+        let formatter = ByteCountFormatter()
+        formatter.countStyle = .file
+        return formatter.string(fromByteCount: bytes)
+    }
     /// Path where the most recent successful snapshot was written;
     /// returned even when the remote is currently unreachable. The
     /// data service falls back to this when `snapshotSQLite` throws so