From 93a64e3e826356aab58e372c8027e357284d4e78 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Tue, 5 May 2026 12:17:45 +0200 Subject: [PATCH] =?UTF-8?q?fix(nous-picker):=20kill=20120s=20beach-ball=20?= =?UTF-8?q?=E2=80=94=20dedupe=20readCache=20+=205s=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two stacking bugs in the Nous-overlay branch of the model picker caused a 120-second beach-ball on remote contexts. **Bug 1 — duplicated readCache.** ModelPickerSheet.refreshNousModels called `service.readCache()` directly (for instant first-paint), then called `service.loadModels(forceRefresh: false)` which calls `readCache()` AGAIN as its first step. Two SSH round-trips per picker open. Drop the inline call; loadModels is already cache-first on its happy path (returns `.cache(...)` when fresh). One read per open. **Bug 2 — 60s readFile timeout for a hint.** `readCache()` goes through SSHTransport.readFile which has a 60s default timeout. On a remote with a corrupted or oversized cache file, `cat` never returns and we wait the full 60s — twice, due to bug 1, for a total 120s picker stall. ScarfMon perf capture (commit 00a1bbd's diagnostic split) localized this precisely: nous.readCache.fileExists = 251 ms ✓ nous.readCache.readFile = 60,011 ms ❌ (60s timeout) Cache is an optimization, not a requirement. Added `readCacheWithTimeout(seconds: 5)` that races readCache against a 5-second sleep via withTaskGroup. On timeout returns nil; caller treats that as no-cache and falls through to the network fetch (which succeeded in 2s in the offending capture, returning 402 models). The runaway `cat` keeps running on its own 60s transport timeout but no longer blocks the picker. New ScarfMon event: `nous.readCache.timeoutFired` surfaces hits in traces so we can tell whether the timeout is being exercised in the wild. The underlying `cat` hang on the cache file is still unexplained; the file size (~500KB) shouldn't take 60s on a 420ms-RTT SSH link. For now: deleting the cache file (`rm ~/.hermes/scarf/nous_models_cache.json` on the remote) is the workaround. The next picker open will rebuild it. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Services/NousModelCatalogService.swift | 39 ++++++++++++++++++- .../Views/Components/ModelPickerSheet.swift | 22 ++++++----- 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift index 4a277b9..1bc0485 100644 --- a/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift @@ -95,6 +95,33 @@ public struct NousModelCatalogService: Sendable { /// cache lands on the droplet, not the user's Mac). Missing or /// malformed cache → nil; the loader treats that as "no cache" and /// kicks off a fresh fetch. + /// Race readCache against a sleep so a hung remote `cat` doesn't + /// stall the picker for the full transport-level timeout (60 s). + /// On timeout returns nil — the caller treats that as "no usable + /// cache" and falls through to the network fetch. + public func readCacheWithTimeout(seconds: TimeInterval) async -> NousModelsCache? { + await withTaskGroup(of: NousModelsCache?.self) { group in + group.addTask { [self] in + // Detached because readCache is sync + does blocking + // SSH I/O; running on the cooperative pool is fine + // for one task but we don't want to fight executor + // scheduling with the timer task below. + await Task.detached { [self] in + readCache() + }.value + } + group.addTask { + try? await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000)) + ScarfMon.event(.diskIO, "nous.readCache.timeoutFired", count: 1) + return nil + } + // First completion wins; cancel the other. + let first = await group.next() ?? nil + group.cancelAll() + return first + } + } + public func readCache() -> NousModelsCache? { ScarfMon.measure(.diskIO, "nous.readCache") { let transport = context.makeTransport() @@ -222,7 +249,17 @@ public struct NousModelCatalogService: Sendable { /// based on the case so it can show a "could not refresh" hint /// next to a stale-but-still-useful list. public func loadModels(forceRefresh: Bool = false) async -> NousModelsLoadResult { - let cached = readCache() + // Cache-read with a short timeout. The underlying SSH `cat` + // can hang on a corrupted or oversized cache file (a + // 120-second picker stall observed in the wild — two 60 s + // timeouts stacked from a duplicated read; perf capture + // localized to `nous.readCache.readFile`). Cache is a + // performance hint, not a correctness requirement; if it + // doesn't return in 5 s, fall through to the network fetch + // and let writeCache rebuild it. The runaway `cat` keeps + // running on its own 60 s transport timeout but no longer + // blocks the picker. + let cached = await readCacheWithTimeout(seconds: 5) if let cached, !forceRefresh, !isCacheStale(cached) { return .cache(models: cached.models, fetchedAt: cached.fetchedAt, refreshError: nil) diff --git a/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift b/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift index 5f919d3..a2c92cd 100644 --- a/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift +++ b/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift @@ -645,15 +645,19 @@ struct ModelPickerSheet: View { /// network — the cache write keeps the next sheet-open instant. private func refreshNousModels(forceRefresh: Bool) async { let service = NousModelCatalogService(context: serverContext) - // Render from cache immediately on the first pass so the user - // doesn't see an empty list while the network call is in - // flight. The async load below overwrites with fresh data - // when it returns. - if !forceRefresh, let cache = service.readCache(), !cache.models.isEmpty, nousModels.isEmpty { - nousModels = cache.models - nousFetchedAt = cache.fetchedAt - nousRefreshError = nil - } + // PRE-FIX (v2.7): this used to call `service.readCache()` + // synchronously here for instant first-paint, then call + // `service.loadModels(...)` which calls `readCache()` AGAIN + // internally — paying the SSH round-trip TWICE per picker + // open. On a remote with a corrupt or oversized cache file, + // the duplicated reads stacked two 60-second timeouts for a + // 120-second picker stall. ScarfMon perf capture confirmed + // the duplication. + // + // loadModels() already serves cache-first on its happy path + // (returns `.cache(...)` when fresh), so the inline readCache + // here is redundant. Drop it; trust loadModels' built-in + // cache-first behavior. One readCache call per picker open. nousIsRefreshing = true let result = await service.loadModels(forceRefresh: forceRefresh) nousIsRefreshing = false