From e7096bb44c62861d47e436431e6c338a932d00e4 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Sat, 9 May 2026 20:14:15 +0200 Subject: [PATCH 1/2] fix(dashboard): coalesce file-watcher fires + dedupe in-flight loads (v0.13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermes v0.13 writes to state.db-wal and rotating logs at ~10 Hz during gateway activity (Checkpoints v2 single-store + session-durability writes hit disk far more often than v0.12). Each FSEvents fire on a watched core path was ticking HermesFileWatcher.lastChangeDate, which every observing view (Dashboard, Projects, ProjectSessions, half a dozen widgets) re-fired its `.onChange` / `.task(id:)` against. On Local hosts the dashboard stacked 5+ concurrent `viewModel.load()` calls in 200 ms, contending on the read-only state.db handle and surfacing as `BackendError error 3` (a sqlite step error from a busy/closed handle) plus visible flickering as isLoading thrashed. Two-part fix: 1. **HermesFileWatcher** coalesces FSEvents fires into one `lastChangeDate` mutation per 500 ms quiet window. A 10 Hz burst of FSEvents collapses into 2 observable mutations per second instead of 10. Both local FSEvents and remote-poll deltas funnel through the same `scheduleCoalescedTick` helper, so SSH contexts get the same protection. `stopWatching` cancels the pending timer alongside the sources so a tear-down doesn't fire one trailing mutation after. 2. **DashboardViewModel.load()** holds a single in-flight `Task` handle. When `.onChange` and `.task` race (or any future caller fires concurrently), the second caller awaits the first's completion instead of starting a parallel load. `isLoading` is no longer thrashed and the data-service refresh runs once per coalesced tick. Pre-v0.13 hosts see no behavioural change — they already wrote to state.db-wal at 1-2 Hz, well below the 500 ms coalesce window. 
v0.13 hosts now see a smooth dashboard that updates ~2 Hz during gateway activity instead of flickering at 10 Hz. Discovered during v2.8.0 dogfooding against a live v0.13.0 host. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Core/Services/HermesFileWatcher.swift | 42 ++++++++++++++++++- .../ViewModels/DashboardViewModel.swift | 33 +++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/scarf/scarf/Core/Services/HermesFileWatcher.swift b/scarf/scarf/Core/Services/HermesFileWatcher.swift index c43bff0..7c64429 100644 --- a/scarf/scarf/Core/Services/HermesFileWatcher.swift +++ b/scarf/scarf/Core/Services/HermesFileWatcher.swift @@ -15,6 +15,21 @@ final class HermesFileWatcher { /// the project list changes. private var remoteProjectPaths: [String] = [] + /// Coalescing timer for `lastChangeDate` ticks. v0.13 Hermes writes to + /// `state.db-wal` and rotating logs at ~10 Hz during gateway activity; + /// every observing view (`DashboardView`, `ProjectsView`, + /// `ProjectSessionsView`, half a dozen widgets) re-fires its `.onChange` + /// or `.task(id:)` on every tick, which stacked concurrent dashboard + /// loads on v0.13 hosts and tripped sqlite contention on the read-only + /// state.db handle. We coalesce to at most one tick per + /// `coalesceWindow` so a burst of FSEvents collapses into one observable + /// state mutation. 500 ms picks the smallest window that still feels + /// responsive on a single keystroke `touch dashboard.json` while + /// surviving v0.13's WAL-write storm. + private var pendingCoalesceTimer: DispatchWorkItem? + private var pendingTickDate: Date? 
+ private static let coalesceWindow: TimeInterval = 0.5 + let context: ServerContext private let transport: any ServerTransport @@ -92,12 +107,32 @@ final class HermesFileWatcher { for await _ in stream { ScarfMon.event(.transport, "mac.fileWatcher.remoteDelta", count: 1) await MainActor.run { [weak self] in - self?.lastChangeDate = Date() + self?.scheduleCoalescedTick() } } } } + /// Coalesce a burst of FSEvents (or remote-poll deltas) into a single + /// `lastChangeDate` mutation after `coalesceWindow` seconds of quiet. + /// Each new fire records the latest event date and pushes the timer + /// out, so a 100-ms-spaced burst of 50 fires collapses to one observable + /// state mutation `coalesceWindow` ms after the LAST fire — same shape + /// as a debounce. Runs on `.main` (the FSEvents queue) so observers + /// see the publish on MainActor without a hop. + private func scheduleCoalescedTick() { + let now = Date() + pendingTickDate = now + pendingCoalesceTimer?.cancel() + let work = DispatchWorkItem { [weak self] in + guard let self, let date = self.pendingTickDate else { return } + self.pendingTickDate = nil + self.lastChangeDate = date + } + pendingCoalesceTimer = work + DispatchQueue.main.asyncAfter(deadline: .now() + Self.coalesceWindow, execute: work) + } + func stopWatching() { for source in coreSources + projectSources { source.cancel() @@ -108,6 +143,9 @@ final class HermesFileWatcher { timer = nil remotePollTask?.cancel() remotePollTask = nil + pendingCoalesceTimer?.cancel() + pendingCoalesceTimer = nil + pendingTickDate = nil } /// Watch each project's `dashboard.json` AND its enclosing `.scarf/` @@ -162,7 +200,7 @@ final class HermesFileWatcher { // message persisted); high counts when nothing's happening // suggest a runaway watcher install. 
ScarfMon.event(.transport, "mac.fileWatcher.localFire", count: 1) - self?.lastChangeDate = Date() + self?.scheduleCoalescedTick() } source.setCancelHandler { Darwin.close(fd) diff --git a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift index fab639e..082f48d 100644 --- a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift +++ b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift @@ -7,6 +7,18 @@ final class DashboardViewModel { private let dataService: HermesDataService private let fileService: HermesFileService + /// Single in-flight load handle. The `.onChange(fileWatcher.lastChangeDate)` + /// observer in `DashboardView` plus `.task` on first appear can both + /// fire concurrent loads — and on v0.13 hosts the FSEvents tick rate + /// during gateway activity used to be high enough that 5+ loads + /// stacked inside 200 ms (HermesFileWatcher's coalesce window now + /// handles that, but defending here keeps the behaviour deterministic + /// on any future watcher chattiness). When a load is in flight, + /// subsequent triggers await its completion; the in-flight load + /// already has a recent-enough snapshot for the user. + @ObservationIgnored + private var inFlightLoad: Task? + init(context: ServerContext = .local) { self.context = context self.dataService = HermesDataService(context: context) @@ -42,6 +54,27 @@ final class DashboardViewModel { var hermesShadows: [ProjectHermesShadowDetector.Shadow] = [] func load() async { + // Coalesce overlapping triggers: the `.task` first-appear and the + // `.onChange(fileWatcher.lastChangeDate)` observer can both fire + // a load in the same tick. Without this guard a v0.13 host's + // WAL-write storm walked over the previous load mid-snapshot + // (see HermesFileWatcher.scheduleCoalescedTick + the v2.8 dogfood + // bug report). 
If a load is already running, await its + // completion and return — the caller already has a fresh snapshot + // by the time `await` returns. + if let existing = inFlightLoad { + await existing.value + return + } + let task: Task = Task { @MainActor [weak self] in + await self?.loadImpl() + } + inFlightLoad = task + await task.value + inFlightLoad = nil + } + + private func loadImpl() async { isLoading = true // refresh() is essentially free for the streaming remote backend // (no transfer — every query is fresh) and a cheap reopen for From ce028b065f9b16fb5ba0317aa01336a25e837d29 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Sat, 9 May 2026 20:41:59 +0200 Subject: [PATCH 2/2] fix(dashboard): max-wait safeguard for scheduleCoalescedTick + drop forward-looking version label Two follow-ups from code review on this branch: 1. Add `maxWait` (1.5 s) safeguard to `HermesFileWatcher.scheduleCoalescedTick` so the trailing-debounce can't be starved indefinitely under sustained activity. Each scheduled fire now picks the earlier of (a) the `coalesceWindow` quiet floor and (b) `maxWait` since the FIRST fire of the current burst. A 10 Hz `state.db-wal` write storm coincident with a `gateway_state.json` Start/Stop touch now publishes within `maxWait` instead of waiting for the WAL activity to subside. The single-fire / quiet-burst case is unchanged because the quiet-window deadline is the earlier of the two, so `min` still picks it. 2. Drop the forward-looking "v2.8 dogfood bug report" reference from a comment in `DashboardViewModel.load()` per the `feedback_no_version_bumps.md` rule (release notes own version labels, not in-code comments). Tests: full ScarfCore suite green (450/450), Mac scheme builds clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Core/Services/HermesFileWatcher.swift | 56 +++++++++++++++---- .../ViewModels/DashboardViewModel.swift | 12 ++-- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/scarf/scarf/Core/Services/HermesFileWatcher.swift b/scarf/scarf/Core/Services/HermesFileWatcher.swift index 7c64429..8cdebc0 100644 --- a/scarf/scarf/Core/Services/HermesFileWatcher.swift +++ b/scarf/scarf/Core/Services/HermesFileWatcher.swift @@ -23,12 +23,26 @@ final class HermesFileWatcher { /// loads on v0.13 hosts and tripped sqlite contention on the read-only /// state.db handle. We coalesce to at most one tick per /// `coalesceWindow` so a burst of FSEvents collapses into one observable - /// state mutation. 500 ms picks the smallest window that still feels - /// responsive on a single keystroke `touch dashboard.json` while - /// surviving v0.13's WAL-write storm. + /// state mutation. + /// + /// **Two limits, not one.** A pure trailing-debounce would starve under + /// sustained WAL writes — the timer would keep getting cancelled and + /// rescheduled, and a coincident `gateway_state.json` Start/Stop touch + /// would never propagate until WAL activity quieted down. So we publish + /// when EITHER (a) `coalesceWindow` of quiet has elapsed since the last + /// fire, OR (b) `maxWait` has elapsed since the first fire of the + /// current burst — whichever comes first. The max-wait guarantees a + /// floor of one observable mutation per `maxWait` even during sustained + /// activity. Numbers picked to keep the dashboard responsive on a + /// single `touch` while surviving v0.13's WAL-write storm. private var pendingCoalesceTimer: DispatchWorkItem? private var pendingTickDate: Date? + /// Wall-clock when the current burst began. Set on the first + /// `scheduleCoalescedTick` fire after a quiet window; cleared whenever + /// the timer fires. Drives the `maxWait` floor below. + private var burstStartDate: Date? 
private static let coalesceWindow: TimeInterval = 0.5 + private static let maxWait: TimeInterval = 1.5 let context: ServerContext private let transport: any ServerTransport @@ -114,23 +128,44 @@ final class HermesFileWatcher { } /// Coalesce a burst of FSEvents (or remote-poll deltas) into a single - /// `lastChangeDate` mutation after `coalesceWindow` seconds of quiet. - /// Each new fire records the latest event date and pushes the timer - /// out, so a 100-ms-spaced burst of 50 fires collapses to one observable - /// state mutation `coalesceWindow` ms after the LAST fire — same shape - /// as a debounce. Runs on `.main` (the FSEvents queue) so observers - /// see the publish on MainActor without a hop. + /// `lastChangeDate` mutation. Two limits decide when the publish fires, + /// whichever comes first: + /// + /// 1. **Quiet window**: `coalesceWindow` seconds have elapsed since the + /// last fire. Each new fire pushes this out — pure debounce shape. + /// 2. **Max wait**: `maxWait` seconds have elapsed since the FIRST fire + /// of the current burst. This bounds the latency floor under + /// sustained activity (v0.13's ~10 Hz WAL-write storm) so a + /// coincident `gateway_state.json` Start/Stop touch can't be starved + /// indefinitely behind a continuously-rescheduling debounce timer. + /// + /// Runs on `.main` (the FSEvents queue and the remote-poll + /// MainActor.run) so observers see the publish on MainActor without a + /// hop. The work item self-clears `burstStartDate` when it fires so the + /// next burst starts a fresh max-wait window. private func scheduleCoalescedTick() { let now = Date() pendingTickDate = now + if burstStartDate == nil { + burstStartDate = now + } pendingCoalesceTimer?.cancel() + // Pick the deadline as the earlier of (a) `coalesceWindow` from now, + // and (b) `maxWait` from the burst start. 
The latter only matters + // when fires keep arriving faster than `coalesceWindow`; in the + // single-fire / quiet-burst case the quiet deadline is the earlier one. + let quietDeadline = now.addingTimeInterval(Self.coalesceWindow) + let maxWaitDeadline = (burstStartDate ?? now).addingTimeInterval(Self.maxWait) + let firingDate = min(quietDeadline, maxWaitDeadline) + let delay = max(0, firingDate.timeIntervalSince(now)) let work = DispatchWorkItem { [weak self] in guard let self, let date = self.pendingTickDate else { return } self.pendingTickDate = nil + self.burstStartDate = nil self.lastChangeDate = date } pendingCoalesceTimer = work - DispatchQueue.main.asyncAfter(deadline: .now() + Self.coalesceWindow, execute: work) + DispatchQueue.main.asyncAfter(deadline: .now() + delay, execute: work) } func stopWatching() { @@ -146,6 +181,7 @@ pendingCoalesceTimer?.cancel() pendingCoalesceTimer = nil pendingTickDate = nil + burstStartDate = nil } /// Watch each project's `dashboard.json` AND its enclosing `.scarf/` diff --git a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift index 082f48d..d7b9d42 100644 --- a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift +++ b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift @@ -56,12 +56,12 @@ final class DashboardViewModel { func load() async { // Coalesce overlapping triggers: the `.task` first-appear and the // `.onChange(fileWatcher.lastChangeDate)` observer can both fire - // a load in the same tick. Without this guard a v0.13 host's - // WAL-write storm walked over the previous load mid-snapshot - // (see HermesFileWatcher.scheduleCoalescedTick + the v2.8 dogfood - // bug report). If a load is already running, await its - // completion and return — the caller already has a fresh snapshot - // by the time `await` returns. + // a load in the same tick. 
Without this guard a Hermes v0.13 + // host's WAL-write storm walked over the previous load + // mid-snapshot (see `HermesFileWatcher.scheduleCoalescedTick`). + // If a load is already running, await its completion and return + // — the caller already has a fresh snapshot by the time `await` + // returns. if let existing = inFlightLoad { await existing.value return