From 93a64e3e826356aab58e372c8027e357284d4e78 Mon Sep 17 00:00:00 2001
From: Alan Wizemann <alan@wizemann.com>
Date: Tue, 5 May 2026 12:17:45 +0200
Subject: [PATCH] =?UTF-8?q?fix(nous-picker):=20kill=20120s=20beach-ball=20?=
 =?UTF-8?q?=E2=80=94=20dedupe=20readCache=20+=205s=20timeout?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two stacking bugs in the Nous-overlay branch of the model picker
caused a 120-second beach-ball on remote contexts.

**Bug 1 — duplicated readCache.** ModelPickerSheet.refreshNousModels
called `service.readCache()` directly (for instant first-paint),
then called `service.loadModels(forceRefresh: false)` which calls
`readCache()` AGAIN as its first step. Two SSH round-trips per
picker open. Drop the inline call; loadModels is already cache-first
on its happy path (returns `.cache(...)` when fresh). One read
per open.

**Bug 2 — 60s readFile timeout for a hint.** `readCache()` goes
through SSHTransport.readFile which has a 60s default timeout. On
a remote with a corrupted or oversized cache file, `cat` never
returns and we wait the full 60s — twice, due to bug 1, for a
total 120s picker stall. ScarfMon perf capture (commit 00a1bbd's
diagnostic split) localized this precisely:

  nous.readCache.fileExists  =   251 ms  ✓
  nous.readCache.readFile    = 60,011 ms  ❌ (60s timeout)

Cache is an optimization, not a requirement. Added
`readCacheWithTimeout(seconds: 5)` that races readCache against
a 5-second sleep via withTaskGroup. On timeout returns nil; caller
treats that as no-cache and falls through to the network fetch
(which succeeded in 2s in the offending capture, returning 402
models). The runaway `cat` keeps running on its own 60s transport
timeout but no longer blocks the picker.

New ScarfMon event: `nous.readCache.timeoutFired` surfaces hits
in traces so we can tell whether the timeout is being exercised
in the wild.

The underlying `cat` hang on the cache file is still unexplained;
the file size (~500KB) shouldn't take 60s on a 420ms-RTT SSH link.
For now: deleting the cache file (`rm ~/.hermes/scarf/nous_models_cache.json`
on the remote) is the workaround. The next picker open will rebuild it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../Services/NousModelCatalogService.swift    | 39 ++++++++++++++++++-
 .../Views/Components/ModelPickerSheet.swift   | 22 ++++++-----
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift
index 4a277b9..1bc0485 100644
--- a/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift
+++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/NousModelCatalogService.swift
@@ -95,6 +95,33 @@ public struct NousModelCatalogService: Sendable {
     /// cache lands on the droplet, not the user's Mac). Missing or
     /// malformed cache → nil; the loader treats that as "no cache" and
     /// kicks off a fresh fetch.
+    /// Race readCache against a sleep so a hung remote `cat` doesn't
+    /// stall the picker for the full transport-level timeout (60 s).
+    /// On timeout returns nil — the caller treats that as "no usable
+    /// cache" and falls through to the network fetch.
+    public func readCacheWithTimeout(seconds: TimeInterval) async -> NousModelsCache? {
+        await withTaskGroup(of: NousModelsCache?.self) { group in
+            group.addTask { [self] in
+                // Detached because readCache is sync + does blocking
+                // SSH I/O; running on the cooperative pool is fine
+                // for one task but we don't want to fight executor
+                // scheduling with the timer task below.
+                await Task.detached { [self] in
+                    readCache()
+                }.value
+            }
+            group.addTask {
+                try? await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000))
+                ScarfMon.event(.diskIO, "nous.readCache.timeoutFired", count: 1)
+                return nil
+            }
+            // First completion wins; cancel the other.
+            let first = await group.next() ?? nil
+            group.cancelAll()
+            return first
+        }
+    }
+
     public func readCache() -> NousModelsCache? {
         ScarfMon.measure(.diskIO, "nous.readCache") {
             let transport = context.makeTransport()
@@ -222,7 +249,17 @@ public struct NousModelCatalogService: Sendable {
     /// based on the case so it can show a "could not refresh" hint
     /// next to a stale-but-still-useful list.
     public func loadModels(forceRefresh: Bool = false) async -> NousModelsLoadResult {
-        let cached = readCache()
+        // Cache-read with a short timeout. The underlying SSH `cat`
+        // can hang on a corrupted or oversized cache file (a
+        // 120-second picker stall observed in the wild — two 60 s
+        // timeouts stacked from a duplicated read; perf capture
+        // localized to `nous.readCache.readFile`). Cache is a
+        // performance hint, not a correctness requirement; if it
+        // doesn't return in 5 s, fall through to the network fetch
+        // and let writeCache rebuild it. The runaway `cat` keeps
+        // running on its own 60 s transport timeout but no longer
+        // blocks the picker.
+        let cached = await readCacheWithTimeout(seconds: 5)
 
         if let cached, !forceRefresh, !isCacheStale(cached) {
             return .cache(models: cached.models, fetchedAt: cached.fetchedAt, refreshError: nil)
diff --git a/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift b/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift
index 5f919d3..a2c92cd 100644
--- a/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift
+++ b/scarf/scarf/Features/Settings/Views/Components/ModelPickerSheet.swift
@@ -645,15 +645,19 @@ struct ModelPickerSheet: View {
     /// network — the cache write keeps the next sheet-open instant.
     private func refreshNousModels(forceRefresh: Bool) async {
         let service = NousModelCatalogService(context: serverContext)
-        // Render from cache immediately on the first pass so the user
-        // doesn't see an empty list while the network call is in
-        // flight. The async load below overwrites with fresh data
-        // when it returns.
-        if !forceRefresh, let cache = service.readCache(), !cache.models.isEmpty, nousModels.isEmpty {
-            nousModels = cache.models
-            nousFetchedAt = cache.fetchedAt
-            nousRefreshError = nil
-        }
+        // PRE-FIX (v2.7): this used to call `service.readCache()`
+        // synchronously here for instant first-paint, then call
+        // `service.loadModels(...)` which calls `readCache()` AGAIN
+        // internally — paying the SSH round-trip TWICE per picker
+        // open. On a remote with a corrupt or oversized cache file,
+        // the duplicated reads stacked two 60-second timeouts for a
+        // 120-second picker stall. ScarfMon perf capture confirmed
+        // the duplication.
+        //
+        // loadModels() already serves cache-first on its happy path
+        // (returns `.cache(...)` when fresh), so the inline readCache
+        // here is redundant. Drop it; trust loadModels' built-in
+        // cache-first behavior. One readCache call per picker open.
         nousIsRefreshing = true
         let result = await service.loadModels(forceRefresh: forceRefresh)
         nousIsRefreshing = false