mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-08 02:14:37 +00:00
fix(chat): paginate session-load + race-guard against session switch
Two related bugs from remote-context perf captures. **Bug 1 — 30s timeout fetching the 157-message session.** The initial page size was 200 messages. For a session including `reasoning_content` from a thinking model, that produces enough JSON over `sqlite3 -json | ssh` to time out at exactly 30s on a 420ms-RTT remote, returning 0 rows. Bumping queryTimeout further just trades latency for stalls. Drop `HistoryPageSize.initial` from 200 → 50. Sized to fit comfortably inside the 30s queryTimeout; the existing "Load earlier" affordance pages back through older messages on demand. **Bug 2 — session-switch race silently swaps transcripts.** When the user picks a small chat while a slow fetch for a different chat is still in flight, the slow fetch finishes second and its `messages = …` assignment overwrites the small chat's transcript. User sees the small chat "jump back" to the big one. ScarfMon trace: parallel `mac.fetchMessages` events at t=641870 (small, 425ms, 2 rows) and t=643316 (big, 30,028ms timeout) — last write won. Add a `loadingForSession` capture and three guards: after the DB refresh, after the primary fetch, after the ACP-fork fetch. Each compares `self.sessionId` against the captured id; on mismatch fire `mac.hydrateMessages.dropped` and return without assigning. Race is silent in normal usage but visible in traces. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,10 +32,14 @@ public enum QueryDefaults: Sendable {
     /// consistent budget — and so we have one knob to retune if perf
     /// concerns shift.
     public enum HistoryPageSize: Sendable {
-        /// Initial chat-history load: covers the vast majority of
-        /// sessions in one fetch while keeping the snapshot read bounded
-        /// for the rare 1000+-message session.
-        public nonisolated static let initial = 200
+        /// Initial chat-history load. **Sized to fit the SSH wire payload
+        /// inside a 30-second `RemoteSQLiteBackend.queryTimeout`.** A
+        /// 157-message session at 200-row page size produced enough
+        /// JSON (with `reasoning_content` for thinking models) to time
+        /// out at exactly 30 s on a 420 ms-RTT remote, returning empty.
+        /// 50 rows comfortably fits that envelope. The "Load earlier"
+        /// affordance pages back through older messages on demand.
+        public nonisolated static let initial = 50
         /// Reconnection reconcile against the DB. 200 rows is plenty —
         /// disconnects don't generate hundreds of unseen messages.
         public nonisolated static let reconcile = 200
@@ -1019,6 +1019,18 @@ public final class RichChatViewModel {
     public func loadSessionHistory(sessionId: String, acpSessionId: String? = nil) async {
         await ScarfMon.measureAsync(.sessionLoad, "mac.hydrateMessages") {
             self.sessionId = sessionId
+            // Capture the session-id we're loading FOR so we can verify
+            // it's still the active one before assigning to `messages`.
+            // Without this guard, switching to a small chat while a
+            // larger one is mid-fetch can result in last-write-wins:
+            // the slow fetch finishes after the small chat's, drops
+            // the user back into the big chat's transcript, and the
+            // user has to reselect the small one. Observed in remote
+            // perf captures (parallel fetchMessages calls, one timing
+            // out at 30s for a 157-message session, the other 2-message
+            // chat completing in 425ms; the 30s one's assignment
+            // overwrote the small chat).
+            let loadingForSession = sessionId
             // Force a fresh snapshot pull on remote contexts. An earlier open()
             // would have cached a stale copy — on resume we need whatever
             // Hermes has actually persisted since then, or the resumed session
@@ -1028,9 +1040,21 @@ public final class RichChatViewModel {
             // messages the agent streamed during the user's offline window.
             let opened = await dataService.refresh(forceFresh: true)
             guard opened else { return }
+            // Race-check #1: session id may have changed during refresh.
+            guard self.sessionId == loadingForSession else {
+                ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1)
+                return
+            }
 
             let pageSize = HistoryPageSize.initial
             var allMessages = await dataService.fetchMessages(sessionId: sessionId, limit: pageSize)
+            // Race-check #2: session id may have changed during the
+            // long fetch (the most common race — a 30s timeout on a
+            // big session lets the user switch to a small one and back).
+            guard self.sessionId == loadingForSession else {
+                ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1)
+                return
+            }
             // The DB has more on-disk history when the initial fetch
             // saturated the limit. The "Load earlier" affordance reads
             // this flag.
@@ -1043,6 +1067,11 @@ public final class RichChatViewModel {
                 originSessionId = sessionId
                 self.sessionId = acpId
                 let acpMessages = await dataService.fetchMessages(sessionId: acpId, limit: pageSize)
+                // Race-check #3: same guard, after the second fetch.
+                guard self.sessionId == acpId else {
+                    ScarfMon.event(.sessionLoad, "mac.hydrateMessages.dropped", count: 1)
+                    return
+                }
                 if !acpMessages.isEmpty {
                     allMessages.append(contentsOf: acpMessages)
                     allMessages.sort { ($0.timestamp ?? .distantPast) < ($1.timestamp ?? .distantPast) }
Reference in New Issue
Block a user