From a19300384264a9ecd0203012ca6f08841eb97a94 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Tue, 5 May 2026 12:38:19 +0200 Subject: [PATCH] fix(chat): paginate session-load + race-guard against session switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs from remote-context perf captures. **Bug 1 — 30s timeout fetching the 157-message session.** The initial page size was 200 messages. For a session including `reasoning_content` from a thinking model, that produces enough JSON over `sqlite3 -json | ssh` to time out at exactly 30s on a 420ms-RTT remote, returning 0 rows. Bumping queryTimeout further just trades latency for stalls. Drop `HistoryPageSize.initial` from 200 → 50. Sized to fit comfortably inside the 30s queryTimeout; the existing "Load earlier" affordance pages back through older messages on demand. **Bug 2 — session-switch race silently swaps transcripts.** When the user picks a small chat while a slow fetch for a different chat is still in flight, the slow fetch finishes second and its `messages = …` assignment overwrites the small chat's transcript. User sees the small chat "jump back" to the big one. ScarfMon trace: parallel `mac.fetchMessages` events at t=641870 (small, 425ms, 2 rows) and t=643316 (big, 30,028ms timeout) — last write won. Add a `loadingForSession` capture and three guards: after the DB refresh, after the primary fetch, after the ACP-fork fetch. Each compares `self.sessionId` against the captured id; on mismatch fire `mac.hydrateMessages.dropped` and return without assigning. Race is silent in normal usage but visible in traces. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ScarfCore/Models/HermesConstants.swift | 12 +++++--- .../ViewModels/RichChatViewModel.swift | 29 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/HermesConstants.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/HermesConstants.swift index 6c21d1b..99c1941 100644 --- a/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/HermesConstants.swift +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/HermesConstants.swift @@ -32,10 +32,14 @@ public enum QueryDefaults: Sendable { /// consistent budget — and so we have one knob to retune if perf /// concerns shift. public enum HistoryPageSize: Sendable { - /// Initial chat-history load: covers the vast majority of - /// sessions in one fetch while keeping the snapshot read bounded - /// for the rare 1000+-message session. - public nonisolated static let initial = 200 + /// Initial chat-history load. **Sized to fit the SSH wire payload + /// inside a 30-second `RemoteSQLiteBackend.queryTimeout`.** A + /// 157-message session at 200-row page size produced enough + /// JSON (with `reasoning_content` for thinking models) to time + /// out at exactly 30 s on a 420 ms-RTT remote, returning empty. + /// 50 rows comfortably fits that envelope. The "Load earlier" + /// affordance pages back through older messages on demand. + public nonisolated static let initial = 50 /// Reconnection reconcile against the DB. 200 rows is plenty — /// disconnects don't generate hundreds of unseen messages. public nonisolated static let reconcile = 200 diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/ViewModels/RichChatViewModel.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/ViewModels/RichChatViewModel.swift index 8d6afa3..07e96c3 100644 --- a/scarf/Packages/ScarfCore/Sources/ScarfCore/ViewModels/RichChatViewModel.swift +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/ViewModels/RichChatViewModel.swift @@ -1019,6 +1019,18 @@ public final class RichChatViewModel { public func loadSessionHistory(sessionId: String, acpSessionId: String? = nil) async { await ScarfMon.measureAsync(.sessionLoad, "mac.hydrateMessages") { self.sessionId = sessionId + // Capture the session-id we're loading FOR so we can verify + // it's still the active one before assigning to `messages`. + // Without this guard, switching to a small chat while a + // larger one is mid-fetch can result in last-write-wins: + // the slow fetch finishes after the small chat's, drops + // the user back into the big chat's transcript, and the + // user has to reselect the small one. Observed in remote + // perf captures (parallel fetchMessages calls, one timing + // out at 30s for a 157-message session, the other 2-message + // chat completing in 425ms; the 30s one's assignment + // overwrote the small chat). + let loadingForSession = sessionId // Force a fresh snapshot pull on remote contexts. An earlier open() // would have cached a stale copy — on resume we need whatever // Hermes has actually persisted since then, or the resumed session @@ -1028,9 +1040,21 @@ public final class RichChatViewModel { // messages the agent streamed during the user's offline window. let opened = await dataService.refresh(forceFresh: true) guard opened else { return } + // Race-check #1: session id may have changed during refresh. + guard self.sessionId == loadingForSession else { + ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1) + return + } let pageSize = HistoryPageSize.initial var allMessages = await dataService.fetchMessages(sessionId: sessionId, limit: pageSize) + // Race-check #2: session id may have changed during the + // long fetch (the most common race — a 30s timeout on a + // big session lets the user switch to a small one and back). + guard self.sessionId == loadingForSession else { + ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1) + return + } // The DB has more on-disk history when the initial fetch // saturated the limit. The "Load earlier" affordance reads // this flag. @@ -1043,6 +1067,11 @@ public final class RichChatViewModel { originSessionId = sessionId self.sessionId = acpId let acpMessages = await dataService.fetchMessages(sessionId: acpId, limit: pageSize) + // Race-check #3: same guard, after the second fetch. + guard self.sessionId == acpId else { + ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1) + return + } if !acpMessages.isEmpty { allMessages.append(contentsOf: acpMessages) allMessages.sort { ($0.timestamp ?? .distantPast) < ($1.timestamp ?? .distantPast) }