mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 10:36:35 +00:00
perf(chat): exclude reasoning_content from initial fetch + drop page size to 25
The 160-message thinking-model session still timed out at the 30s
ceiling even after dropping page size 200→50 in commit a193003.
ScarfMon trace:
mac.fetchMessages 30,105,329,125 ns ← 30s timeout fired
mac.hydrateMessages.rows count=1 ← 1 partial row only
Root cause: `reasoning_content` is huge on thinking models (20+
KB per row). Even 50 rows × 30 KB = 1.5 MB JSON shipping over a
420ms-RTT remote SSH channel exceeds the budget. The chat
appeared empty AGAIN.
Three cuts:
1. **`messageColumnsLight`** — same as messageColumns but omits
`reasoning_content`. Used by `fetchMessages` so the bulk
wire payload is small. `messageFromRow` reads
reasoning_content via `row.optionalString(at: 11)` which
gracefully returns nil when the column isn't present, so the
shape change is transparent.
2. **`fetchReasoningContent(for:)`** — single-row lazy fetch
the inspector pane calls when the user expands a thinking
disclosure. One small SSH round-trip per inspection vs. paying
for ALL reasoning content on every session boot.
3. **`HistoryPageSize.initial` 50 → 25** — sized for the lite
column shape with margin for sessions that include some heavy
tool-call payloads. The "Load earlier" affordance still
pages back through older messages.
Net effect on the user-reported case: 160-message session loads
the most-recent 25 messages in ~5-10s (one SSH round-trip ~420ms
plus ~3 KB × 25 = 75 KB wire). The remaining 135 are reachable
via Load earlier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -36,10 +36,17 @@ public enum HistoryPageSize: Sendable {
|
|||||||
/// inside a 30-second `RemoteSQLiteBackend.queryTimeout`.** A
|
/// inside a 30-second `RemoteSQLiteBackend.queryTimeout`.** A
|
||||||
/// 157-message session at 200-row page size produced enough
|
/// 157-message session at 200-row page size produced enough
|
||||||
/// JSON (with `reasoning_content` for thinking models) to time
|
/// JSON (with `reasoning_content` for thinking models) to time
|
||||||
/// out at exactly 30 s on a 420 ms-RTT remote, returning empty.
|
/// out at exactly 30 s on a 420 ms-RTT remote. Dropped to 50,
|
||||||
/// 50 rows comfortably fits that envelope. The "Load earlier"
|
/// then to 25 in v2.7 after a 160-message session still timed
|
||||||
|
/// out at 50 — `reasoning_content` for thinking-model turns can
|
||||||
|
/// run 20+ KB per row, so 50 rows × 30 KB = 1.5 MB JSON which
|
||||||
|
/// over a slow SSH channel still trips the 30s budget. Pair
|
||||||
|
/// with `messageColumnsLight` (excludes `reasoning_content`)
|
||||||
|
/// so the on-wire payload is small even at this size; the
|
||||||
|
/// inspector pane lazy-loads via `fetchReasoningContent(for:)`
|
||||||
|
/// when the user expands a disclosure. The "Load earlier"
|
||||||
/// affordance pages back through older messages on demand.
|
/// affordance pages back through older messages on demand.
|
||||||
public nonisolated static let initial = 50
|
public nonisolated static let initial = 25
|
||||||
/// Reconnection reconcile against the DB. 200 rows is plenty —
|
/// Reconnection reconcile against the DB. 200 rows is plenty —
|
||||||
/// disconnects don't generate hundreds of unseen messages.
|
/// disconnects don't generate hundreds of unseen messages.
|
||||||
public nonisolated static let reconcile = 200
|
public nonisolated static let reconcile = 200
|
||||||
|
|||||||
@@ -142,6 +142,31 @@ public actor HermesDataService {
|
|||||||
return cols
|
return cols
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Column list for bulk message fetches: `messageColumns` minus
/// `reasoning_content`.
///
/// Hermes v0.11+ stores a thinking model's full chain-of-thought
/// transcript in `reasoning_content`, and a single message can carry
/// 20+ KB of it. Across a 160-message session that adds up to more
/// than 1 MB on the wire, which was enough to time out a 30s SSH
/// `sqlite3 -json` fetch on a 420ms-RTT remote (confirmed by a perf
/// capture). The chat bubble body never renders that column; only the
/// inspector pane does, and only on demand. The initial fetch
/// therefore leaves it out, and `fetchReasoningContent` retrieves it
/// lazily once the inspector is opened.
private var messageColumnsLight: String {
    let baseColumns = """
        id, session_id, role, content, tool_call_id, tool_calls,
        tool_name, timestamp, token_count, finish_reason
        """
    // The v0.11+ `reasoning_content` column is deliberately never
    // appended here. `messageFromRow` falls back to nil for it, and
    // callers that need the value lazy-load it through
    // `fetchReasoningContent(for:)`.
    guard hasV07Schema else { return baseColumns }
    return baseColumns + ", reasoning"
}
|
||||||
|
|
||||||
// MARK: - Session Queries
|
// MARK: - Session Queries
|
||||||
|
|
||||||
public func fetchSessions(limit: Int = QueryDefaults.sessionLimit) async -> [HermesSession] {
|
public func fetchSessions(limit: Int = QueryDefaults.sessionLimit) async -> [HermesSession] {
|
||||||
@@ -189,13 +214,19 @@ public actor HermesDataService {
|
|||||||
before: Int? = nil
|
before: Int? = nil
|
||||||
) async -> [HermesMessage] {
|
) async -> [HermesMessage] {
|
||||||
await ScarfMon.measureAsync(.sessionLoad, "mac.fetchMessages") {
|
await ScarfMon.measureAsync(.sessionLoad, "mac.fetchMessages") {
|
||||||
|
// Use the lite column set — excludes reasoning_content which
|
||||||
|
// can be 20+ KB per message on thinking-model sessions and
|
||||||
|
// was the cause of repeated 30s SSH timeouts on 100+-message
|
||||||
|
// sessions over 420ms-RTT remote links. The inspector pane
|
||||||
|
// calls `fetchReasoningContent(for:)` to lazy-load when the
|
||||||
|
// user opens a message's disclosure.
|
||||||
let sql: String
|
let sql: String
|
||||||
let params: [SQLValue]
|
let params: [SQLValue]
|
||||||
if let before {
|
if let before {
|
||||||
sql = "SELECT \(messageColumns) FROM messages WHERE session_id = ? AND id < ? ORDER BY id DESC LIMIT ?"
|
sql = "SELECT \(messageColumnsLight) FROM messages WHERE session_id = ? AND id < ? ORDER BY id DESC LIMIT ?"
|
||||||
params = [.text(sessionId), .integer(Int64(before)), .integer(Int64(limit))]
|
params = [.text(sessionId), .integer(Int64(before)), .integer(Int64(limit))]
|
||||||
} else {
|
} else {
|
||||||
sql = "SELECT \(messageColumns) FROM messages WHERE session_id = ? ORDER BY id DESC LIMIT ?"
|
sql = "SELECT \(messageColumnsLight) FROM messages WHERE session_id = ? ORDER BY id DESC LIMIT ?"
|
||||||
params = [.text(sessionId), .integer(Int64(limit))]
|
params = [.text(sessionId), .integer(Int64(limit))]
|
||||||
}
|
}
|
||||||
do {
|
do {
|
||||||
@@ -211,6 +242,23 @@ public actor HermesDataService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fetches the `reasoning_content` column for one message on demand.
///
/// Intended for the inspector disclosure on a thinking-model reply:
/// a single-row query sidesteps the bulk payload-size problem that
/// motivated `messageColumnsLight`.
///
/// - Parameter messageId: Value matched against `messages.id`.
/// - Returns: The stored reasoning text, or `nil` when the database
///   lacks the v0.11 schema, no row matches, the column holds no
///   string, or the query fails (the failure is logged as a warning).
public func fetchReasoningContent(for messageId: Int) async -> String? {
    // Without the v0.11 schema there is no column to query.
    if !hasV011Schema {
        return nil
    }
    do {
        let matches = try await backend.query(
            "SELECT reasoning_content FROM messages WHERE id = ?",
            params: [.integer(Int64(messageId))]
        )
        guard let row = matches.first else { return nil }
        return row.optionalString(at: 0)
    } catch {
        Self.logger.warning("fetchReasoningContent failed: \(error.localizedDescription, privacy: .public)")
        return nil
    }
}
|
||||||
|
|
||||||
/// Legacy unbounded fetch retained for one release cycle so any
|
/// Legacy unbounded fetch retained for one release cycle so any
|
||||||
/// out-of-tree consumers don't break. New code should use the
|
/// out-of-tree consumers don't break. New code should use the
|
||||||
/// bounded `fetchMessages(sessionId:limit:before:)` variant —
|
/// bounded `fetchMessages(sessionId:limit:before:)` variant —
|
||||||
|
|||||||
Reference in New Issue
Block a user