From 111fe9bb679665f6230363a97dfcc3cb581867c1 Mon Sep 17 00:00:00 2001 From: Alan Wizemann Date: Mon, 4 May 2026 14:32:06 +0200 Subject: [PATCH] feat(oauth): unblock remote re-auth + daily keepalive to prevent expiry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for OAuth subscriptions (Nous Portal, Anthropic Claude OAuth, etc.): - **Remote re-auth stall**: Both `NousAuthFlow` and `OAuthFlowController` set `PYTHONUNBUFFERED=1` only on local contexts. On remote, setting `proc.environment` only affects the local-side ssh process — not the remote python interpreter. ssh doesn't forward arbitrary env vars without `SendEnv` configured on both sides, so remote hermes ran with default block-buffered stdout and the device-code prompt never reached Scarf — the sheet hung at "Contacting Nous Portal" forever. Fix: when remote, wrap the command in `env PYTHONUNBUFFERED=1 …` to inject the var on the remote side regardless of ssh config. - **Daily keepalive**: Hermes refreshes OAuth access tokens on agent startup but never proactively. If the user goes longer than the refresh-token lifetime (~30 days for Nous) without starting a session, the refresh token itself expires and full re-auth is required. New `OAuthKeepaliveCronService` registers a Scarf-owned daily cron job (`[scarf:oauth-keepalive] OAuth token refresh`) at 4am that runs a minimal one-token prompt — booting the session is what triggers `resolve_nous_runtime_credentials()`. Wired as an opt-in toggle in the OAuth providers section of CredentialPoolsView. When `hermes auth refresh <provider>` lands upstream we'll swap the prompt for that verb; the surrounding wiring stays unchanged. - **Stale-refresh nudge**: `NousSubscriptionState` gains `daysSinceLastRefresh()` + `hasStaleRefresh` (>= 14 days, roughly half of Nous's 30-day refresh-token window). 
The keepalive section surfaces an inline orange warning when stale and the toggle is off — points the user at the toggle that would have prevented the problem. Verification: scarfCore 263/263; Mac app builds clean. Manual repro of remote stall against Digital Ocean droplet pending user test. Co-Authored-By: Claude Opus 4.7 (1M context) --- scarf/scarf/Core/Services/NousAuthFlow.swift | 49 ++++--- .../Services/NousSubscriptionService.swift | 25 ++++ .../Services/OAuthKeepaliveCronService.swift | 121 ++++++++++++++++++ .../ViewModels/OAuthFlowController.swift | 32 +++-- .../Views/CredentialPoolsView.swift | 119 +++++++++++++++++ 5 files changed, 317 insertions(+), 29 deletions(-) create mode 100644 scarf/scarf/Core/Services/OAuthKeepaliveCronService.swift diff --git a/scarf/scarf/Core/Services/NousAuthFlow.swift b/scarf/scarf/Core/Services/NousAuthFlow.swift index b9c130e..2d8b5aa 100644 --- a/scarf/scarf/Core/Services/NousAuthFlow.swift +++ b/scarf/scarf/Core/Services/NousAuthFlow.swift @@ -62,27 +62,36 @@ final class NousAuthFlow { output = "" state = .starting - let proc = context.makeTransport().makeProcess( - executable: context.paths.hermesBinary, - args: ["auth", "add", "nous", "--no-browser"] - ) - if !context.isRemote { - // Only enrich env locally — remote ssh gets the remote login env - // naturally, and exporting our local keys into it would be wrong. + // Python block-buffers stdout when it's a pipe (not a TTY). The + // device-code flow prints the verification URL + user code, then + // enters a ~15-minute polling loop that never hits `input()` — + // so nothing flushes and our readability handler never sees the + // output. Users see the sheet spinning forever while hermes is + // actually waiting for approval. + // + // PKCE doesn't have this problem because `input("Authorization + // code: ")` flushes stdout before blocking, which is why + // OAuthFlowController works without this setting. + // + // Local: set on `proc.environment`. 
Remote: setting + // `proc.environment` would only configure the local-side ssh + // process, NOT the remote python interpreter — ssh doesn't + // forward arbitrary env without `SendEnv` configured on both + // sides. So for remote we wrap the command in `env + // PYTHONUNBUFFERED=1 …`, which prefixes the var into the + // remote command's environment regardless of ssh config. + let proc: Process + if context.isRemote { + proc = context.makeTransport().makeProcess( + executable: "env", + args: ["PYTHONUNBUFFERED=1", context.paths.hermesBinary, "auth", "add", "nous", "--no-browser"] + ) + } else { + proc = context.makeTransport().makeProcess( + executable: context.paths.hermesBinary, + args: ["auth", "add", "nous", "--no-browser"] + ) var env = HermesFileService.enrichedEnvironment() - // Python block-buffers stdout when it's a pipe (not a TTY). The - // device-code flow prints the verification URL + user code, then - // enters a ~15-minute polling loop that never hits `input()` — - // so nothing flushes and our readability handler never sees the - // output. Users see the sheet spinning forever while hermes is - // actually waiting for approval. - // - // PKCE doesn't have this problem because `input("Authorization - // code: ")` flushes stdout before blocking, which is why - // OAuthFlowController works without this setting. - // - // PYTHONUNBUFFERED forces line-buffered stdout for the whole - // subprocess — tiny perf cost, huge UX win for device-code. env["PYTHONUNBUFFERED"] = "1" proc.environment = env } diff --git a/scarf/scarf/Core/Services/NousSubscriptionService.swift b/scarf/scarf/Core/Services/NousSubscriptionService.swift index f7e10da..e7812a3 100644 --- a/scarf/scarf/Core/Services/NousSubscriptionService.swift +++ b/scarf/scarf/Core/Services/NousSubscriptionService.swift @@ -25,6 +25,31 @@ struct NousSubscriptionState: Sendable, Hashable { /// to line up: auth record present *and* `nous` is the active provider. 
/// Mirrors `NousSubscriptionFeatures.subscribed` on the Python side. var subscribed: Bool { present && providerIsNous } + + /// Days since the auth record was last touched (refreshed by Hermes + /// or re-authed by the user). Hermes refreshes on every agent boot, + /// so a large value here means the user hasn't started a session + /// recently — which is exactly when the refresh token is at risk + /// of expiring (typical ~30 day lifetime). Returns nil when + /// `updatedAt` is unknown (older Hermes versions). Returns 0 + /// when `updatedAt` is not in the past (e.g. clock skew). + func daysSinceLastRefresh(now: Date = Date()) -> Int? { + guard let updatedAt else { return nil } + let seconds = now.timeIntervalSince(updatedAt) + guard seconds > 0 else { return 0 } + return Int(seconds / 86_400) + } + + /// True when we haven't seen a Hermes refresh in ≥14 days — roughly half + /// the typical 30-day Nous refresh-token lifetime. This is the + /// trigger for the "enable keepalive" nudge: still recoverable + /// (refresh token hasn't expired yet) but heading there. Returns + /// false when `updatedAt` is unknown — we don't nudge on missing + /// data, only on confirmed staleness. + var hasStaleRefresh: Bool { + guard let days = daysSinceLastRefresh() else { return false } + return days >= 14 + } } /// Reads `auth.json` to detect Nous Portal subscription state. Delegates file diff --git a/scarf/scarf/Core/Services/OAuthKeepaliveCronService.swift b/scarf/scarf/Core/Services/OAuthKeepaliveCronService.swift new file mode 100644 index 0000000..9b18be4 --- /dev/null +++ b/scarf/scarf/Core/Services/OAuthKeepaliveCronService.swift @@ -0,0 +1,121 @@ +import Foundation +import ScarfCore +import os + +/// Manages a Scarf-owned cron job that keeps OAuth refresh tokens +/// alive by booting a trivial Hermes session on a daily cadence. 
+/// +/// **Why this exists.** Hermes refreshes OAuth access tokens on +/// agent startup (via `resolve_nous_runtime_credentials()` and +/// equivalents), but never proactively. If the user goes longer than +/// the *refresh*-token lifetime without starting a session, the +/// refresh token itself expires and only a full re-auth recovers it. +/// Refresh-token lifetimes are typically ~30 days; a 24-hour +/// heartbeat keeps the window from closing for users who go quiet. +/// +/// **What it runs.** A single cron job with a stable name +/// (`Self.jobName`) and a minimal one-token prompt. Executing the +/// job boots `hermes acp` end-to-end, which is what triggers the +/// refresh. There is no public Hermes CLI verb to refresh a token in +/// isolation today (no `hermes auth refresh <provider>`), so booting +/// a session is the only mechanism we have. When Hermes adds a +/// dedicated refresh verb, swap the prompt for a `--script` that +/// invokes it and the surrounding wiring stays unchanged. +/// +/// **Identification.** The job is found by exact-match on +/// `Self.jobName`. Users can edit the schedule from the Cron tab +/// without breaking detection — only the name is load-bearing here. +@MainActor +final class OAuthKeepaliveCronService { + /// Stable job name. The leading `[scarf:oauth-keepalive]` prefix + /// follows the convention `ProjectTemplateInstaller` uses for + /// template-installed cron jobs (`[tmpl:<id>] …`) so future + /// inspection tools can distinguish Scarf-owned schedules from + /// user-authored ones at a glance. + static let jobName = "[scarf:oauth-keepalive] OAuth token refresh" + + /// 4am local daily. Off-peak avoids contending with interactive + /// usage and is a reasonable default; users can reschedule from + /// the Cron tab if they prefer a different cadence. The cron + /// window must stay <= the shortest refresh-token lifetime among + /// the user's configured OAuth providers (~30d for Nous). 
+ static let defaultSchedule = "0 4 * * *" + + /// Minimal prompt. The point is to boot a session — not to do + /// useful work — so we want the LLM call to terminate fast. A + /// one-word prompt + a one-word reply is the cheapest end-to-end + /// turn. Subscription-routed providers (Nous) bear zero + /// per-call cost; for API-key users, a single trivial turn per + /// day is negligible compared to the alternative of full re-auth + /// every month. + static let defaultPrompt = "Reply with the single word 'ok'." + + private let logger = Logger(subsystem: "com.scarf", category: "OAuthKeepaliveCronService") + let context: ServerContext + private let fileService: HermesFileService + + init(context: ServerContext = .local) { + self.context = context + self.fileService = HermesFileService(context: context) + } + + // MARK: - Read + + /// Returns the keepalive job if one is currently registered, nil + /// otherwise. Reads `~/.hermes/cron/jobs.json` synchronously via + /// the existing `loadCronJobs()` path. + nonisolated func currentJob() -> HermesCronJob? { + fileService.loadCronJobs().first { $0.name == Self.jobName } + } + + nonisolated func isEnabled() -> Bool { + currentJob() != nil + } + + // MARK: - Mutate + + /// Register the keepalive job via `hermes cron create`. No-op when + /// a job with the same name already exists — toggle semantics + /// stay idempotent so a double-tap doesn't duplicate the entry. + /// Returns true on success or no-op, false on CLI failure. 
+ @discardableResult + nonisolated func enable() async -> Bool { + if isEnabled() { return true } + let result = await Task.detached { [fileService] in + fileService.runHermesCLI( + args: [ + "cron", "create", + "--name", Self.jobName, + "--silent", + Self.defaultSchedule, + Self.defaultPrompt, + ], + timeout: 60 + ) + }.value + if result.exitCode != 0 { + logger.warning("oauth-keepalive enable failed: exit=\(result.exitCode) output=\(result.output, privacy: .public)") + return false + } + return true + } + + /// Remove the keepalive job. Idempotent — when no job exists + /// today, the call is a no-op success. Returns true on success + /// or no-op, false on CLI failure. + @discardableResult + nonisolated func disable() async -> Bool { + guard let job = currentJob() else { return true } + let result = await Task.detached { [fileService] in + fileService.runHermesCLI( + args: ["cron", "remove", job.id], + timeout: 30 + ) + }.value + if result.exitCode != 0 { + logger.warning("oauth-keepalive disable failed: exit=\(result.exitCode) output=\(result.output, privacy: .public)") + return false + } + return true + } +} diff --git a/scarf/scarf/Features/CredentialPools/ViewModels/OAuthFlowController.swift b/scarf/scarf/Features/CredentialPools/ViewModels/OAuthFlowController.swift index 089c714..66a075a 100644 --- a/scarf/scarf/Features/CredentialPools/ViewModels/OAuthFlowController.swift +++ b/scarf/scarf/Features/CredentialPools/ViewModels/OAuthFlowController.swift @@ -93,15 +93,29 @@ final class OAuthFlowController { // local spawns hermes directly, remote rounds through ssh -T while // preserving stdin (for the auth-code prompt) and stdout (for the // URL parser). - let proc = context.makeTransport().makeProcess( - executable: context.paths.hermesBinary, - args: args - ) - if !context.isRemote { - // Only enrich env locally — the remote ssh process gets the - // remote login env naturally, and exporting our local API keys - // into it would be wrong. 
- proc.environment = HermesFileService.enrichedEnvironment() + // + // PYTHONUNBUFFERED forces line-buffered Python stdout so the URL + // banner reaches us before `input("Authorization code: ")` + // blocks. PKCE *usually* recovers because input() flushes, but + // certain providers print preamble lines AFTER the prompt that + // we still want streamed in real time. Local: set on + // `proc.environment`. Remote: ssh doesn't forward arbitrary env + // vars without `SendEnv` configured, so wrap the command in + // `env PYTHONUNBUFFERED=1 …` to inject it on the remote side. + let proc: Process + if context.isRemote { + proc = context.makeTransport().makeProcess( + executable: "env", + args: ["PYTHONUNBUFFERED=1", context.paths.hermesBinary] + args + ) + } else { + proc = context.makeTransport().makeProcess( + executable: context.paths.hermesBinary, + args: args + ) + var env = HermesFileService.enrichedEnvironment() + env["PYTHONUNBUFFERED"] = "1" + proc.environment = env } let outPipe = Pipe() diff --git a/scarf/scarf/Features/CredentialPools/Views/CredentialPoolsView.swift b/scarf/scarf/Features/CredentialPools/Views/CredentialPoolsView.swift index 6b582db..c0ac0d0 100644 --- a/scarf/scarf/Features/CredentialPools/Views/CredentialPoolsView.swift +++ b/scarf/scarf/Features/CredentialPools/Views/CredentialPoolsView.swift @@ -15,8 +15,27 @@ struct CredentialPoolsView: View { @State private var reauthInitialProvider: String? @Environment(AppCoordinator.self) private var coordinator + /// Mirror of `OAuthKeepaliveCronService.isEnabled()` so the + /// toggle reads from local @State (instant) instead of hitting + /// disk on every render. `nil` while the initial probe is in + /// flight; reloaded on appear and after every enable/disable. + @State private var keepaliveEnabled: Bool? + @State private var keepaliveBusy: Bool = false + @State private var keepaliveError: String? + /// Cached Nous subscription state. 
Used by `keepaliveSection` to + /// surface a contextual nudge when the auth record hasn't been + /// refreshed in ≥14 days — that's exactly when enabling the + /// keepalive cron is highest-value. Loaded async on appear; the + /// section renders without the nudge while this is `.absent`. + @State private var nousSubscription: NousSubscriptionState = .absent + + private let keepalive: OAuthKeepaliveCronService + private let nousService: NousSubscriptionService + init(context: ServerContext) { _viewModel = State(initialValue: CredentialPoolsViewModel(context: context)) + self.keepalive = OAuthKeepaliveCronService(context: context) + self.nousService = NousSubscriptionService(context: context) } @@ -32,6 +51,7 @@ struct CredentialPoolsView: View { emptyState } else { if !viewModel.oauthProviders.isEmpty { + keepaliveSection oauthProvidersSection } ForEach(viewModel.pools) { pool in @@ -53,6 +73,7 @@ struct CredentialPoolsView: View { .onAppear { viewModel.load() consumePendingReauth() + probeKeepalive() } .onChange(of: coordinator.pendingOAuthReauth) { _, _ in consumePendingReauth() @@ -91,6 +112,104 @@ struct CredentialPoolsView: View { coordinator.pendingOAuthReauth = nil } + /// Read the current keepalive cron job state off the main + /// thread. Disk reads on remote contexts can take 100–300ms + /// (one SFTP round-trip for `~/.hermes/cron/jobs.json`) so this + /// hops to a detached task and only flips `keepaliveEnabled` on + /// MainActor when the result lands. Concurrently loads the Nous + /// subscription record so the staleness nudge is computed off + /// the same probe. + private func probeKeepalive() { + let svc = keepalive + let nous = nousService + Task.detached { + let enabled = svc.isEnabled() + let state = nous.loadState() + await MainActor.run { + keepaliveEnabled = enabled + nousSubscription = state + } + } + } + + /// Section above the OAuth providers list with a single toggle + /// that registers / removes a Scarf-owned daily cron job. 
The + /// job's only purpose is to boot a Hermes session, which is what + /// causes Hermes to refresh OAuth access tokens (no standalone + /// CLI verb for refresh exists today). The toggle is disabled until + /// the initial probe lands, so the placeholder off state shown + /// meanwhile can't be acted on. + @ViewBuilder + private var keepaliveSection: some View { + let isOn = keepaliveEnabled ?? false + let stale = nousSubscription.hasStaleRefresh && keepaliveEnabled == false + SettingsSection(title: LocalizedStringKey("Keep tokens fresh"), icon: "arrow.clockwise") { + HStack(alignment: .top, spacing: 12) { + Image(systemName: "arrow.clockwise.circle") + .foregroundStyle(.secondary) + VStack(alignment: .leading, spacing: 4) { + Toggle(isOn: Binding( + get: { isOn }, + set: { newValue in toggleKeepalive(to: newValue) } + )) { + Text("Auto-refresh OAuth tokens daily") + .font(.system(.body, weight: .medium)) + } + .toggleStyle(.switch) + .disabled(keepaliveEnabled == nil || keepaliveBusy) + Text("Registers a `\(OAuthKeepaliveCronService.jobName)` cron job that runs at 4am daily. Booting a Hermes session is what triggers token refresh — without this, refresh tokens silently expire if you go ~30 days without using Scarf.") + .font(.caption) + .foregroundStyle(.secondary) + .fixedSize(horizontal: false, vertical: true) + if stale, let days = nousSubscription.daysSinceLastRefresh() { + HStack(spacing: 6) { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundStyle(.orange) + Text("Your Nous subscription was last refreshed \(days) days ago. 
Enable the toggle above to prevent the refresh token from expiring.") + .font(.caption) + .foregroundStyle(.orange) + .fixedSize(horizontal: false, vertical: true) + } + .padding(.top, 4) + } + if let err = keepaliveError { + Text(err) + .font(.caption) + .foregroundStyle(.red) + .textSelection(.enabled) + } + } + Spacer(minLength: 0) + if keepaliveBusy { + ProgressView().controlSize(.small) + } + } + .padding(.horizontal, 12) + .padding(.vertical, 8) + .background(.quaternary.opacity(0.3)) + } + } + + private func toggleKeepalive(to newValue: Bool) { + guard !keepaliveBusy else { return } + keepaliveBusy = true + keepaliveError = nil + let svc = keepalive + Task.detached { + let ok = newValue ? await svc.enable() : await svc.disable() + let actualState = svc.isEnabled() + await MainActor.run { + keepaliveBusy = false + keepaliveEnabled = actualState + if !ok { + keepaliveError = newValue + ? "Couldn't register the keepalive cron job. Check `hermes cron` works in a terminal." + : "Couldn't remove the keepalive cron job. Check `hermes cron remove` works in a terminal." + } + } + } + } + private var header: some View { ScarfPageHeader( "Credential Pools",