feat(chat-resilience): iOS reconnect + snapshot fallback + paging + pill fix

Brings iOS chat to parity with Mac's reconnect behavior so a session
survives phone-sleep, network handoffs, and SSH socket drops without
losing the agent's work — Hermes already persists messages to state.db
in real time; the iOS app just had no resync path.

Core changes (shared between Mac and iOS via ScarfCore):

- ServerTransport.cachedSnapshotPath: fall back to the cached state.db
  snapshot when a fresh pull fails. HermesDataService surfaces this via
  isUsingStaleSnapshot + lastSnapshotMtime so views can render "Last
  updated X ago." The fallback is on by default (refresh(forceFresh: false));
  the chat-history reload passes forceFresh: true to refuse stale data.
- HermesDataService.fetchMessages(sessionId:limit:before:): bounded
  pagination by id desc. Legacy unbounded overload deprecated. New
  HistoryPageSize constants centralize the budget.
- RichChatViewModel.loadEarlier(): pages back through the current
  session via oldestLoadedMessageID + hasMoreHistory; a view-side
  sketch follows this list.
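
For orientation, a minimal view-side sketch of the new pagination
surface (the view and its wiring here are hypothetical; the shipped
chat screen differs):

    import SwiftUI

    // Hypothetical header row above the message list: shows "Load
    // earlier" while the DB still holds unloaded history.
    struct LoadEarlierHeader: View {
        let viewModel: RichChatViewModel   // assumed observable + injected

        var body: some View {
            if viewModel.hasMoreHistory {
                Button {
                    Task { await viewModel.loadEarlier() }   // pages back by id
                } label: {
                    if viewModel.isLoadingEarlier {
                        ProgressView()
                    } else {
                        Text("Load earlier")
                    }
                }
                .disabled(viewModel.isLoadingEarlier)
            }
        }
    }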

iOS-only:

- ChatController gains the Mac reconnect machinery: 5-attempt
  exponential backoff (1→16s; loop shape sketched after this list)
  via session/resume → session/load,
  reconcileWithDB on success, "Resynced N new messages" toast.
  startACPEventLoop + startHealthMonitor extracted as helpers.
- New NetworkReachabilityService (NWPathMonitor singleton). Suspends
  reconnect attempts while offline; kicks a fresh cycle on link-up.
- ScarfGoCoordinator + ScarfGoTabRoot funnel scenePhase transitions to
  ChatController.handleScenePhase. On .active we verify channel
  health and reconnect if dead.
- Draft persistence: UserDefaults keyed by (serverID, sessionID)
  survives force-quit. 7-day janitor at app launch.
- Connection-state banner: .reconnecting and .offline render slim
  ScarfDesign-tinted strips above the message list. .failed keeps
  using the existing full-screen overlay.
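
The reconnect loop referenced above is roughly the following shape (a
self-contained sketch; the resume hook and the caller-side follow-ups
are hypothetical stand-ins for the real ChatController methods):

    // Up to 5 attempts, waiting 1, 2, 4, 8, 16 s before each one, and
    // bailing out while reachability says we're offline (the link-up
    // transition kicks a fresh cycle).
    func reconnectWithBackoff(
        isOnline: () -> Bool,        // fed from NetworkReachabilityService.isSatisfied
        resume: () async -> Bool     // session/resume → session/load round-trip
    ) async -> Bool {
        for attempt in 0..<5 {
            try? await Task.sleep(nanoseconds: UInt64(1 << attempt) * 1_000_000_000)
            guard isOnline() else { return false }    // suspend; don't burn attempts offline
            if await resume() { return true }         // caller reconciles with DB + shows the toast
        }
        return false                                  // caller falls through to the .failed overlay
    }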

Bonus fix:

- ConnectionStatusViewModel tier-2 probe now checks state.db instead
  of config.yaml. Hermes v0.11+ doesn't materialize config.yaml until
  the user changes a setting, so a freshly-installed working Hermes
  was being marked "degraded — config missing" indefinitely. state.db
  is the file Scarf actually depends on.

Out of scope (deferred): APNs push notifications, BGTaskScheduler-
based extended-background keepalive, offline write queue.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -27,6 +27,28 @@ public enum QueryDefaults: Sendable {
public nonisolated static let defaultSilenceThreshold = 200
}
/// Page sizes for `HermesDataService.fetchMessages(sessionId:limit:before:)`.
/// Centralized so iOS, Mac, and the polling code paths can pick a
/// consistent budget and so we have one knob to retune if perf
/// concerns shift.
public enum HistoryPageSize: Sendable {
/// Initial chat-history load: covers the vast majority of
/// sessions in one fetch while keeping the snapshot read bounded
/// for the rare 1000+-message session.
public nonisolated static let initial = 200
/// Reconnection reconcile against the DB. 200 rows is plenty;
/// disconnects don't generate hundreds of unseen messages.
public nonisolated static let reconcile = 200
/// Mac sessions detail view. Larger to reduce paging friction in the
/// desktop browser-style read; the desktop has the screen real
/// estate and memory headroom for it.
public nonisolated static let macSessionDetail = 500
/// Terminal-mode polling refresh. Same 500-row budget as Mac
/// detail; covers sessions long enough that the user is actively
/// scrolling but bounded to keep each poll tick cheap.
public nonisolated static let polling = 500
}
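// Illustrative call site (assumed shape, not part of this change): callers
// hand one of these budgets straight to the bounded fetch, e.g.
//
//     let firstPage = await dataService.fetchMessages(
//         sessionId: sessionId,
//         limit: HistoryPageSize.initial
//     )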
// MARK: - File Size Formatting
public enum FileSizeUnit: Sendable {
@@ -61,6 +61,26 @@ public actor HermesDataService {
/// instead of an empty Dashboard with no explanation.
public private(set) var lastOpenError: String?
/// Modification date of the underlying state.db that backs the
/// currently-open connection. For local contexts this tracks the
/// live DB's mtime; for remote contexts it's the cached snapshot's
/// mtime, which equals "when did we last get fresh data."
public private(set) var lastSnapshotMtime: Date?
/// True when a `snapshotSQLite` pull failed and the open succeeded
/// against a previously-cached snapshot instead of a fresh one.
/// Views render a "Last updated X ago" affordance when this is set
/// alongside `lastOpenError`. Always `false` for local contexts.
public private(set) var isUsingStaleSnapshot: Bool = false
/// Convenience: how long ago the cached snapshot was written, when
/// we're using a stale snapshot. `nil` when the snapshot is fresh
/// or no mtime could be read.
public var staleAge: TimeInterval? {
guard isUsingStaleSnapshot, let m = lastSnapshotMtime else { return nil }
return Date().timeIntervalSince(m)
}
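// Illustrative consumption (hypothetical view-side code): the staleness
// metadata above is enough to build the "Last updated X ago" banner copy, e.g.
//
//     if await dataService.isUsingStaleSnapshot,
//        let mtime = await dataService.lastSnapshotMtime {
//         let rel = RelativeDateTimeFormatter()
//         bannerText = "Last updated \(rel.localizedString(for: mtime, relativeTo: Date()))"
//     }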
public let context: ServerContext
private let transport: any ServerTransport
@@ -70,6 +90,18 @@ public actor HermesDataService {
}
public func open() async -> Bool {
await openInternal(forceFresh: false)
}
/// Variant that refuses the stale-snapshot fallback. Used by call
/// sites that genuinely need post-write consistency, most notably
/// the chat session-history reload, where a stale snapshot would
/// hide messages the agent just streamed.
private func openStrict() async -> Bool {
await openInternal(forceFresh: true)
}
private func openInternal(forceFresh: Bool) async -> Bool {
if db != nil { return true }
let localPath: String
if context.isRemote {
@@ -86,10 +118,30 @@ public actor HermesDataService {
)
localPath = url.path
lastOpenError = nil
isUsingStaleSnapshot = false
lastSnapshotMtime = mtime(at: url)
} catch {
lastOpenError = humanize(error)
Self.logger.warning("snapshotSQLite failed: \(error.localizedDescription, privacy: .public)")
return false
// Fresh pull failed. If the caller demanded fresh data
// (`forceFresh: true`) OR there's no usable cache on
// disk, surface the error and bail. Otherwise serve
// the cached snapshot with `isUsingStaleSnapshot = true`
// so views can render a "Last updated X ago" banner.
if !forceFresh,
let cached = transport.cachedSnapshotPath,
FileManager.default.fileExists(atPath: cached.path)
{
localPath = cached.path
isUsingStaleSnapshot = true
lastSnapshotMtime = mtime(at: cached)
lastOpenError = humanize(error) // user still sees why it's stale
Self.logger.warning(
"Using stale snapshot after pull failure: \(error.localizedDescription, privacy: .public)"
)
} else {
lastOpenError = humanize(error)
Self.logger.warning("snapshotSQLite failed: \(error.localizedDescription, privacy: .public)")
return false
}
}
} else {
localPath = context.paths.stateDB
@@ -97,6 +149,8 @@ public actor HermesDataService {
lastOpenError = "Hermes state database not found at \(localPath)."
return false
}
isUsingStaleSnapshot = false
lastSnapshotMtime = mtime(at: URL(fileURLWithPath: localPath))
}
// Remote snapshots are point-in-time copies that no one writes to;
// opening them with `immutable=1` tells SQLite to skip WAL/SHM and
@@ -151,17 +205,27 @@ public actor HermesDataService {
return desc
}
/// Force a fresh snapshot pull + reopen. Used on session-load and in
/// any path that needs the UI to reflect writes Hermes just made.
/// Without this, remote snapshots would be frozen at the first `open()`
/// for the app's lifetime; new messages added to a resumed session
/// would never appear because the snapshot was pulled before they were
/// written. Local contexts pay essentially nothing: close+reopen on a
/// live DB is a no-op.
/// Close the current connection and re-open with a fresh snapshot
/// pull (when remote). When `forceFresh` is `false` (default) and
/// the snapshot pull fails, falls back to the cached snapshot;
/// `isUsingStaleSnapshot` is set so views can render a "Last
/// updated X ago" banner. Pass `forceFresh: true` from call sites
/// that genuinely need post-write consistency (chat session
/// history reload), where stale data would hide messages the
/// agent just streamed.
@discardableResult
public func refresh() async -> Bool {
public func refresh(forceFresh: Bool = false) async -> Bool {
close()
return await open()
return await openInternal(forceFresh: forceFresh)
}
/// Read the modification date of a local file. Returns `nil` if
/// the file is unreachable or has no mtime metadata. Used to
/// stamp `lastSnapshotMtime` so views can show "Last updated
/// X ago" without each one duplicating the FileManager dance.
private nonisolated func mtime(at url: URL) -> Date? {
let attrs = try? FileManager.default.attributesOfItem(atPath: url.path)
return attrs?[.modificationDate] as? Date
}
public func close() {
@@ -294,6 +358,50 @@ public actor HermesDataService {
return cols
}
/// Bounded message fetch keyed by message id (monotonic per row,
/// safer than timestamp-based pagination because streaming chunk
/// timestamps can collide). Returns the most recent `limit`
/// messages older than `before` (when supplied) in chronological
/// (ASC) order, ready to display. Pass `before: nil` for the
/// initial load; the DB returns the newest `limit` rows.
public func fetchMessages(
sessionId: String,
limit: Int,
before: Int? = nil
) -> [HermesMessage] {
guard let db else { return [] }
let sql: String
if before != nil {
sql = "SELECT \(messageColumns) FROM messages WHERE session_id = ? AND id < ? ORDER BY id DESC LIMIT ?"
} else {
sql = "SELECT \(messageColumns) FROM messages WHERE session_id = ? ORDER BY id DESC LIMIT ?"
}
var stmt: OpaquePointer?
guard sqlite3_prepare_v2(db, sql, -1, &stmt, nil) == SQLITE_OK else { return [] }
defer { sqlite3_finalize(stmt) }
sqlite3_bind_text(stmt, 1, sessionId, -1, sqliteTransient)
if let before {
sqlite3_bind_int(stmt, 2, Int32(before))
sqlite3_bind_int(stmt, 3, Int32(limit))
} else {
sqlite3_bind_int(stmt, 2, Int32(limit))
}
var messages: [HermesMessage] = []
while sqlite3_step(stmt) == SQLITE_ROW {
messages.append(messageFromRow(stmt!))
}
// Caller wants chronological (oldest-first) order; the SELECT
// is DESC for the LIMIT to bite the newest rows, so reverse.
return messages.reversed()
}
/// Legacy unbounded fetch retained for one release cycle so any
/// out-of-tree consumers don't break. New code should use the
/// bounded `fetchMessages(sessionId:limit:before:)` variant;
/// snapshot loads on 1000+-message sessions stall the UI when
/// they materialize the whole history at once.
@available(*, deprecated, message: "Use fetchMessages(sessionId:limit:before:) instead.")
public func fetchMessages(sessionId: String) -> [HermesMessage] {
guard let db else { return [] }
let sql = "SELECT \(messageColumns) FROM messages WHERE session_id = ? ORDER BY timestamp ASC"
@@ -247,6 +247,11 @@ public struct LocalTransport: ServerTransport {
URL(fileURLWithPath: remotePath)
}
/// Local transport reads the live DB directly; there's no cached
/// snapshot to fall back to (and no failure mode where falling back
/// would help, since a missing local file is missing both ways).
public var cachedSnapshotPath: URL? { nil }
// MARK: - Watching
#if canImport(Darwin)
@@ -603,6 +603,14 @@ public struct SSHTransport: ServerTransport {
return URL(fileURLWithPath: localPath)
}
/// Path where the most recent successful snapshot was written;
/// returned even when the remote is currently unreachable. The
/// data service falls back to this when `snapshotSQLite` throws so
/// Dashboard / Sessions / Chat-history stay viewable offline.
public var cachedSnapshotPath: URL? {
URL(fileURLWithPath: snapshotDir + "/state.db")
}
// MARK: - Watching
public func watchPaths(_ paths: [String]) -> AsyncStream<WatchEvent> {
@@ -90,6 +90,19 @@ public protocol ServerTransport: Sendable {
/// `~/Library/Caches/scarf/<serverID>/state.db`, returning that URL.
nonisolated func snapshotSQLite(remotePath: String) throws -> URL
/// Local filesystem URL where this transport caches its SQLite snapshot,
/// returned even when the remote is unreachable. Callers should check
/// `FileManager.default.fileExists(atPath:)` before reading; the
/// transport can't atomically check existence and return the URL
/// in one step without TOCTOU. Local transports return `nil`
/// (their data is the live DB, not a cache).
///
/// Used by `HermesDataService.open()` to fall back to the last
/// successful snapshot when a fresh `snapshotSQLite` call fails,
/// so the app keeps showing data with a "Last updated X ago"
/// affordance instead of a blank screen.
nonisolated var cachedSnapshotPath: URL? { get }
// MARK: - Watching
/// Observe changes to a set of paths and yield events when any of them
@@ -16,7 +16,7 @@ public final class ConnectionStatusViewModel {
#endif
public enum Status: Equatable {
/// Healthy: SSH connected AND we can read `~/.hermes/config.yaml`.
/// Healthy: SSH connected AND we can read `~/.hermes/state.db`.
case connected
/// SSH connects but the follow-up read-access probe failed. Data
/// views will be empty until this is resolved.
@@ -38,14 +38,17 @@ public final class ConnectionStatusViewModel {
/// Specific tier-2 failure mode emitted by the probe script. Used to
/// drive both the pill copy and the popover hint (issue #53).
public enum DegradedCause: Equatable {
/// `config.yaml` is missing entirely. Most common cause: Hermes
/// hasn't run `setup` yet on this remote.
/// `state.db` is missing entirely. Most common cause: Hermes
/// is installed but no session has run on this remote yet.
/// Case name kept as `configMissing` for back-compat with
/// callers that pattern-match on it; "config" here is shorthand
/// for "Scarf's required state file."
case configMissing
/// `~/.hermes` itself doesn't exist. Hermes isn't installed for
/// the SSH user on this host.
case homeMissing
/// File exists but the SSH user can't read it. Permission /
/// ownership mismatch.
/// ownership mismatch. Same back-compat note as above.
case configUnreadable
/// `~/.hermes/active_profile` points at a non-default Hermes
/// profile and the configured Hermes home doesn't carry the
@@ -110,10 +113,18 @@ public final class ConnectionStatusViewModel {
let hermesHome = context.paths.home
// Two-tier probe in one SSH round-trip:
// tier 1: `true`, i.e. raw connectivity / auth / ControlMaster path
// tier 2: `test -r $HERMESHOME/config.yaml`, i.e. can we actually
// read the file Dashboard reads on every tick? Green pill
// only if both pass; yellow "degraded" if tier 1 passes
// but tier 2 fails (the exact symptom in issue #19).
// tier 2: `test -r $HERMESHOME/state.db`, i.e. can we actually read
// the file Dashboard / Sessions / Activity all hit on
// every tick? Green pill only if both pass.
//
// Probe historically targeted `config.yaml`, but Hermes v0.11+
// doesn't materialize that file eagerly; it ships with sane
// defaults and only writes config.yaml when the user actually
// changes something. Result: a freshly-installed Hermes that's
// running, persisting sessions, and serving Scarf was being
// marked "degraded — config missing" indefinitely. `state.db`
// is created on first agent run and is the actual surface
// Scarf depends on, so we probe that instead.
// Script emits two lines: TIER1:<exitcode> and TIER2:<exitcode>.
let homeArg: String
if hermesHome.hasPrefix("~/") {
@@ -124,22 +135,21 @@ public final class ConnectionStatusViewModel {
homeArg = "\"\(hermesHome.replacingOccurrences(of: "\"", with: "\\\""))\""
}
// Probe emits a granular `TIER2:1:<cause>` code so the pill can
// surface a specific hint (issue #53) instead of the prior
// collapsed-to-binary "can't read config.yaml". Causes:
// surface a specific hint (issue #53). Causes:
// no-home          $H itself doesn't exist
// missing          config.yaml absent
// missing          state.db absent (Hermes hasn't been run yet)
// perm             exists but unreadable by SSH user
// profile:<name>   config missing AND ~/.hermes/active_profile
// profile:<name>   state.db missing AND ~/.hermes/active_profile
//                  points at a Hermes profile, suggesting Scarf
//                  is reading the wrong dir
let script = """
echo TIER1:0
H=\(homeArg)
if [ -r "$H/config.yaml" ]; then
if [ -r "$H/state.db" ]; then
echo TIER2:0
elif [ ! -d "$H" ]; then
echo TIER2:1:no-home
elif [ ! -e "$H/config.yaml" ]; then
elif [ ! -e "$H/state.db" ]; then
ACTIVE=""
if [ -r "$HOME/.hermes/active_profile" ]; then
ACTIVE=$(head -n1 "$HOME/.hermes/active_profile" 2>/dev/null | tr -d ' \\t\\r\\n')
@@ -263,23 +273,23 @@ public final class ConnectionStatusViewModel {
)
case .configMissing:
return (
"Hermes hasn't been set up yet",
"`\(hermesHome)/config.yaml` is missing. Run `hermes setup` (or your first `hermes chat`) on the remote to create it. Scarf will go green automatically once it appears."
"Hermes hasn't been run yet",
"`\(hermesHome)/state.db` is missing — Hermes creates it on first agent run. Start any session on the remote (e.g. `hermes chat`) and Scarf will go green automatically."
)
case .configUnreadable:
return (
"Permission denied on config.yaml",
"`\(hermesHome)/config.yaml` exists but the SSH user can't read it. Check ownership: `ls -l \(hermesHome)/config.yaml`. Either run Hermes as the SSH user, `chmod a+r` the file, or SSH as the Hermes user."
"Permission denied on state.db",
"`\(hermesHome)/state.db` exists but the SSH user can't read it. Check ownership: `ls -l \(hermesHome)/state.db`. Either run Hermes as the SSH user, `chmod a+r` the file, or SSH as the Hermes user."
)
case .profileActive(let name):
return (
"Hermes profile \"\(name)\" is active",
"The remote is using Hermes profile `\(name)` — its config lives at `~/.hermes/profiles/\(name)/config.yaml`, not `\(hermesHome)/config.yaml`. Either set this server's Hermes home to `~/.hermes/profiles/\(name)` in Manage Servers → Edit, or run `hermes profile use default` on the remote to revert."
"The remote is using Hermes profile `\(name)` — its state lives at `~/.hermes/profiles/\(name)/state.db`, not `\(hermesHome)/state.db`. Either set this server's Hermes home to `~/.hermes/profiles/\(name)` in Manage Servers → Edit, or run `hermes profile use default` on the remote to revert."
)
case .unknown:
return (
"Can't read Hermes state",
"SSH is fine but Scarf can't reach `\(hermesHome)/config.yaml`. Run diagnostics for a full breakdown."
"SSH is fine but Scarf can't reach `\(hermesHome)/state.db`. Run diagnostics for a full breakdown."
)
}
}
@@ -339,6 +339,20 @@ public final class RichChatViewModel {
/// The original CLI session ID when resuming a CLI session via ACP.
/// Used to combine old CLI messages with new ACP messages.
public private(set) var originSessionId: String?
/// Smallest DB id currently loaded for the *current session* (i.e.
/// `sessionId`). Drives `loadEarlier()`: page back with
/// `before: oldestLoadedMessageID`. `nil` when nothing has been
/// loaded yet or the session has no DB-persisted messages.
public private(set) var oldestLoadedMessageID: Int?
/// Whether the most recent fetch suggests there are more older
/// messages on disk that haven't been loaded into `messages` yet.
/// Set to `true` when the initial fetch returned exactly `limit`
/// rows (a strong hint the table has more). Drives the "Load
/// earlier" button visibility in chat views.
public private(set) var hasMoreHistory: Bool = false
/// Set while a `loadEarlier()` fetch is in flight so the UI can show a
/// spinner and we don't fan out duplicate page requests.
public private(set) var isLoadingEarlier: Bool = false
private var nextLocalId = -1
private var streamingAssistantText = ""
private var streamingThinkingText = ""
@@ -382,6 +396,9 @@ public final class RichChatViewModel {
lastKnownFingerprint = nil
sessionId = nil
originSessionId = nil
oldestLoadedMessageID = nil
hasMoreHistory = false
isLoadingEarlier = false
isAgentWorking = false
userSendPending = false
resetTimestamp = Date()
@@ -875,12 +892,15 @@ public final class RichChatViewModel {
let opened = await dataService.open()
guard opened else { return }
var dbMessages = await dataService.fetchMessages(sessionId: sessionId)
// Reconnects don't generate hundreds of unseen messages, so a
// 200-row tail is plenty for the merge and it keeps us from
// re-materializing 1000+ message sessions on every reconnect.
var dbMessages = await dataService.fetchMessages(sessionId: sessionId, limit: HistoryPageSize.reconcile)
// If we have an origin session (CLI session continued via ACP),
// include those messages too
if let origin = originSessionId, origin != sessionId {
let originMessages = await dataService.fetchMessages(sessionId: origin)
let originMessages = await dataService.fetchMessages(sessionId: origin, limit: HistoryPageSize.reconcile)
if !originMessages.isEmpty {
dbMessages = originMessages + dbMessages
dbMessages.sort { ($0.timestamp ?? .distantPast) < ($1.timestamp ?? .distantPast) }
@@ -925,10 +945,18 @@ public final class RichChatViewModel {
// would have cached a stale copy on resume; we need whatever
// Hermes has actually persisted since then, or the resumed session
// will show only history up to the moment the snapshot was taken.
let opened = await dataService.refresh()
// `forceFresh: true` refuses the stale-snapshot fallback the data
// service grew in M11; falling back here would silently hide
// messages the agent streamed during the user's offline window.
let opened = await dataService.refresh(forceFresh: true)
guard opened else { return }
var allMessages = await dataService.fetchMessages(sessionId: sessionId)
let pageSize = HistoryPageSize.initial
var allMessages = await dataService.fetchMessages(sessionId: sessionId, limit: pageSize)
// The DB has more on-disk history when the initial fetch
// saturated the limit. The "Load earlier" affordance reads
// this flag.
var moreHistory = allMessages.count >= pageSize
let session = await dataService.fetchSession(id: sessionId)
// If the ACP session is different from the origin, load its messages too
@@ -936,10 +964,11 @@ public final class RichChatViewModel {
if let acpId = acpSessionId, acpId != sessionId {
originSessionId = sessionId
self.sessionId = acpId
let acpMessages = await dataService.fetchMessages(sessionId: acpId)
let acpMessages = await dataService.fetchMessages(sessionId: acpId, limit: pageSize)
if !acpMessages.isEmpty {
allMessages.append(contentsOf: acpMessages)
allMessages.sort { ($0.timestamp ?? .distantPast) < ($1.timestamp ?? .distantPast) }
moreHistory = moreHistory || acpMessages.count >= pageSize
}
}
@@ -947,6 +976,51 @@ public final class RichChatViewModel {
currentSession = session
let minId = allMessages.map(\.id).min() ?? 0
nextLocalId = min(minId - 1, -1)
// Track the oldest loaded id from THIS session (not the merged
// origin) so `loadEarlier()` pages back through the live ACP
// session's history. Cross-session backfill (paging into the
// CLI origin) isn't supported in v1; the merged 2× pageSize
// is enough headroom for the dashboard-resume case.
let currentSessionId = self.sessionId ?? sessionId
oldestLoadedMessageID = allMessages
.filter { $0.sessionId == currentSessionId }
.map(\.id)
.min()
hasMoreHistory = moreHistory
buildMessageGroups()
}
// MARK: - Load Earlier (pagination)
/// Page back through the current session's DB-persisted history
/// before `oldestLoadedMessageID` and prepend the page to
/// `messages`. Cheap on the SQLite side (`id` is the primary
/// key); the cost is the data-service `open()` round-trip on
/// remote contexts. `pageSize` defaults to the same 200-row
/// budget as the initial load.
public func loadEarlier(pageSize: Int = HistoryPageSize.initial) async {
guard !isLoadingEarlier, hasMoreHistory else { return }
guard let sessionId, let oldest = oldestLoadedMessageID else { return }
isLoadingEarlier = true
defer { isLoadingEarlier = false }
let opened = await dataService.open()
guard opened else { return }
let older = await dataService.fetchMessages(
sessionId: sessionId,
limit: pageSize,
before: oldest
)
guard !older.isEmpty else {
hasMoreHistory = false
return
}
messages.insert(contentsOf: older, at: 0)
oldestLoadedMessageID = older.first?.id
// If this fetch returned fewer than the page size, we've hit
// the bottom of the table; no further pages are worth fetching.
hasMoreHistory = older.count >= pageSize
buildMessageGroups()
}
@@ -990,7 +1064,7 @@ public final class RichChatViewModel {
let fingerprint = await dataService.fetchMessageFingerprint(sessionId: sessionId)
if fingerprint != lastKnownFingerprint {
let fetched = await dataService.fetchMessages(sessionId: sessionId)
let fetched = await dataService.fetchMessages(sessionId: sessionId, limit: HistoryPageSize.polling)
let session = await dataService.fetchSession(id: sessionId)
lastKnownFingerprint = fingerprint
@@ -165,6 +165,15 @@ public final class CitadelServerTransport: ServerTransport, @unchecked Sendable
try runSync { try await self.asyncSnapshotSQLite(remotePath: remotePath) }
}
/// Path where the most recent successful snapshot was written;
/// returned even when the SSH connection is currently down. The
/// data service falls back to this when `snapshotSQLite` throws so
/// Dashboard / Sessions / Chat-history stay viewable while the
/// phone is offline.
public var cachedSnapshotPath: URL? {
snapshotBaseDir.appendingPathComponent("state.db")
}
// MARK: - ServerTransport: watching
public func watchPaths(_ paths: [String]) -> AsyncStream<WatchEvent> {
@@ -0,0 +1,88 @@
import Foundation
import Network
import Observation
#if canImport(os)
import os
#endif
/// Process-wide reachability monitor wrapping `NWPathMonitor`. Used by
/// `ChatController` to decide when to attempt a reconnect (on
/// `.satisfied`) vs. mark the chat offline (on `.unsatisfied`).
///
/// Singleton because `NWPathMonitor` is per-process by design; there's
/// no benefit to instantiating multiple monitors, and the cost (a small
/// background queue per instance) accumulates if every controller
/// spawns its own.
///
/// ## Usage
///
/// Don't read the underlying monitor's path from a SwiftUI view body;
/// the runtime samples through `NWPathMonitor`'s queue, and a `body`
/// re-evaluation that touches `currentPath` directly would block. Read
/// `isSatisfied` / observe `transitionTick` instead. Tests and
/// non-iOS callers can use the no-op default behavior (`isSatisfied`
/// reports `true`).
@Observable
@MainActor
public final class NetworkReachabilityService {
public static let shared = NetworkReachabilityService()
/// `true` when the OS reports a usable network path (any
/// interface). Inverted via `!isSatisfied` for "we're offline."
public private(set) var isSatisfied: Bool = true
/// Mirrors `NWPath.isExpensive`. Useful as a UI hint to avoid
/// auto-fetching big payloads on cellular. Not consumed yet;
/// reserved so callers don't have to add another property later.
public private(set) var isExpensive: Bool = false
/// Monotonic counter that bumps every time `isSatisfied` changes.
/// Views observe `transitionTick` rather than `isSatisfied` to
/// kick a `.onChange` even if the value is the same as before
/// (rare but possible during rapid network flapping).
public private(set) var transitionTick: Int = 0
private let monitor = NWPathMonitor()
private let queue = DispatchQueue(label: "com.scarf.ios.reachability")
#if canImport(os)
private static let logger = Logger(subsystem: "com.scarf.ios", category: "NetworkReachability")
#endif
private init() {
// Seed from the current path synchronously so first reads on
// launch don't show "satisfied" while the OS reports otherwise.
// `currentPath` is safe here at init (the monitor hasn't been
// started yet, no queue handler is firing).
let initial = monitor.currentPath
self.isSatisfied = (initial.status == .satisfied)
self.isExpensive = initial.isExpensive
monitor.pathUpdateHandler = { [weak self] path in
// Bounce back through MainActor; the class is main-actor-
// isolated, so its published properties must be mutated there.
// The pathUpdateHandler is invoked on `queue`, which is a
// private background queue.
Task { @MainActor in
guard let self else { return }
let satisfied = (path.status == .satisfied)
if self.isSatisfied != satisfied {
self.isSatisfied = satisfied
self.transitionTick &+= 1
#if canImport(os)
Self.logger.info(
"Reachability transition: \(satisfied ? "satisfied" : "unsatisfied", privacy: .public)"
)
#endif
}
self.isExpensive = path.isExpensive
}
}
monitor.start(queue: queue)
}
deinit {
// Singleton is process-lifetime; this only runs on shutdown.
monitor.cancel()
}
}
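// Illustrative consumption (hypothetical; ChatController's actual wiring
// differs): keying off `transitionTick` lets a view react even when a fast
// offline→online flap leaves `isSatisfied` at the value it started at.
//
//     .onChange(of: NetworkReachabilityService.shared.transitionTick) {
//         if NetworkReachabilityService.shared.isSatisfied {
//             chatController.reconnectIfNeeded()   // hypothetical entry point
//         }
//     }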