From e7096bb44c62861d47e436431e6c338a932d00e4 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Sat, 9 May 2026 20:14:15 +0200 Subject: [PATCH] fix(dashboard): coalesce file-watcher fires + dedupe in-flight loads (v0.13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermes v0.13 writes to state.db-wal and rotating logs at ~10 Hz during gateway activity (Checkpoints v2 single-store + session-durability writes hit disk far more often than v0.12). Each FSEvents fire on a watched core path was ticking HermesFileWatcher.lastChangeDate, which every observing view (Dashboard, Projects, ProjectSessions, half a dozen widgets) re-fired its `.onChange` / `.task(id:)` against. On Local hosts the dashboard stacked 5+ concurrent `viewModel.load()` calls in 200 ms, contending on the read-only state.db handle and surfacing as `BackendError error 3` (a sqlite step error from a busy/closed handle) plus visible flickering as isLoading thrashed. Two-part fix: 1. **HermesFileWatcher** coalesces FSEvents fires into one `lastChangeDate` mutation per 500 ms quiet window. A 10 Hz burst of FSEvents collapses into a single observable mutation 500 ms after the burst goes quiet, instead of 10 mutations per second while it lasts. Both local FSEvents and remote-poll deltas funnel through the same `scheduleCoalescedTick` helper, so SSH contexts get the same protection. `stopWatching` cancels the pending timer alongside the sources so a tear-down doesn't fire one trailing mutation afterwards. 2. **DashboardViewModel.load()** holds a single in-flight `Task` handle. When `.onChange` and `.task` race (or any future caller fires concurrently), the second caller awaits the first's completion instead of starting a parallel load. `isLoading` is no longer thrashed and the data-service refresh runs once per coalesced tick. Pre-v0.13 hosts see no behavioural change — they already wrote to state.db-wal at 1-2 Hz, well below the 500 ms coalesce window. 
v0.13 hosts now see a smooth dashboard that refreshes once per 500 ms quiet window instead of flickering at 10 Hz during gateway activity. Discovered during v2.8.0 dogfooding against a live v0.13.0 host. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Core/Services/HermesFileWatcher.swift | 42 ++++++++++++++++++- .../ViewModels/DashboardViewModel.swift | 33 +++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/scarf/scarf/Core/Services/HermesFileWatcher.swift b/scarf/scarf/Core/Services/HermesFileWatcher.swift index c43bff0..7c64429 100644 --- a/scarf/scarf/Core/Services/HermesFileWatcher.swift +++ b/scarf/scarf/Core/Services/HermesFileWatcher.swift @@ -15,6 +15,21 @@ final class HermesFileWatcher { /// the project list changes. private var remoteProjectPaths: [String] = [] + /// Coalescing timer for `lastChangeDate` ticks. v0.13 Hermes writes to + /// `state.db-wal` and rotating logs at ~10 Hz during gateway activity; + /// every observing view (`DashboardView`, `ProjectsView`, + /// `ProjectSessionsView`, half a dozen widgets) re-fires its `.onChange` + /// or `.task(id:)` on every tick, which stacked concurrent dashboard + /// loads on v0.13 hosts and tripped sqlite contention on the read-only + /// state.db handle. We coalesce to at most one tick per + /// `coalesceWindow` of quiet so a burst of FSEvents collapses into one + /// observable state mutation. 500 ms picks the smallest window that still + /// feels responsive on a single keystroke `touch dashboard.json` while + /// surviving v0.13's WAL-write storm. + private var pendingCoalesceTimer: DispatchWorkItem? + private var pendingTickDate: Date? 
+ private static let coalesceWindow: TimeInterval = 0.5 + let context: ServerContext private let transport: any ServerTransport @@ -92,12 +107,32 @@ final class HermesFileWatcher { for await _ in stream { ScarfMon.event(.transport, "mac.fileWatcher.remoteDelta", count: 1) await MainActor.run { [weak self] in - self?.lastChangeDate = Date() + self?.scheduleCoalescedTick() } } } } + /// Coalesce a burst of FSEvents (or remote-poll deltas) into a single + /// `lastChangeDate` mutation after `coalesceWindow` seconds of quiet. + /// Each new fire records the latest event date and pushes the timer + /// out, so a 100-ms-spaced burst of 50 fires collapses to one observable + /// state mutation `coalesceWindow` seconds after the LAST fire — same + /// shape as a debounce. Runs on `.main` (the FSEvents queue) so observers + /// see the publish on MainActor without a hop. + private func scheduleCoalescedTick() { + let now = Date() + pendingTickDate = now + pendingCoalesceTimer?.cancel() + let work = DispatchWorkItem { [weak self] in + guard let self, let date = self.pendingTickDate else { return } + self.pendingTickDate = nil + self.lastChangeDate = date + } + pendingCoalesceTimer = work + DispatchQueue.main.asyncAfter(deadline: .now() + Self.coalesceWindow, execute: work) + } + func stopWatching() { for source in coreSources + projectSources { source.cancel() @@ -108,6 +143,9 @@ final class HermesFileWatcher { timer = nil remotePollTask?.cancel() remotePollTask = nil + pendingCoalesceTimer?.cancel() + pendingCoalesceTimer = nil + pendingTickDate = nil } /// Watch each project's `dashboard.json` AND its enclosing `.scarf/` @@ -162,7 +200,7 @@ // message persisted); high counts when nothing's happening // suggest a runaway watcher install. 
ScarfMon.event(.transport, "mac.fileWatcher.localFire", count: 1) - self?.lastChangeDate = Date() + self?.scheduleCoalescedTick() } source.setCancelHandler { Darwin.close(fd) diff --git a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift index fab639e..082f48d 100644 --- a/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift +++ b/scarf/scarf/Features/Dashboard/ViewModels/DashboardViewModel.swift @@ -7,6 +7,18 @@ final class DashboardViewModel { private let dataService: HermesDataService private let fileService: HermesFileService + /// Single in-flight load handle. The `.onChange(fileWatcher.lastChangeDate)` + /// observer in `DashboardView` plus `.task` on first appear can both + /// fire concurrent loads — and on v0.13 hosts the FSEvents tick rate + /// during gateway activity used to be high enough that 5+ loads + /// stacked inside 200 ms (HermesFileWatcher's coalesce window now + /// handles that, but defending here keeps the behaviour deterministic + /// on any future watcher chattiness). When a load is in flight, + /// subsequent triggers await its completion instead of starting a + /// second load; the in-flight load already has a + /// recent-enough snapshot for the user. + @ObservationIgnored + private var inFlightLoad: Task<Void, Never>? + init(context: ServerContext = .local) { self.context = context self.dataService = HermesDataService(context: context) @@ -42,6 +54,27 @@ var hermesShadows: [ProjectHermesShadowDetector.Shadow] = [] func load() async { + // Coalesce overlapping triggers: the `.task` first-appear and the + // `.onChange(fileWatcher.lastChangeDate)` observer can both fire + // a load in the same tick. Without this guard a v0.13 host's + // WAL-write storm walked over the previous load mid-snapshot + // (see HermesFileWatcher.scheduleCoalescedTick + the v2.8 dogfood + // bug report). 
If a load is already running, await its + // completion and return — the caller already has a fresh snapshot + // by the time `await` returns. + if let existing = inFlightLoad { + await existing.value + return + } + let task: Task<Void, Never> = Task { @MainActor [weak self] in + await self?.loadImpl() + } + inFlightLoad = task + await task.value + inFlightLoad = nil + } + + private func loadImpl() async { isLoading = true // refresh() is essentially free for the streaming remote backend // (no transfer — every query is fresh) and a cheap reopen for