mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-08 02:14:37 +00:00
feat(oauth): unblock remote re-auth + daily keepalive to prevent expiry
Two related fixes for OAuth subscriptions (Nous Portal, Anthropic Claude OAuth, etc.): - **Remote re-auth stall**: Both `NousAuthFlow` and `OAuthFlowController` set `PYTHONUNBUFFERED=1` only on local contexts. On remote, setting `proc.environment` only affects the local-side ssh process — not the remote python interpreter. ssh doesn't forward arbitrary env vars without `SendEnv` configured on both sides, so remote hermes ran with default block-buffered stdout and the device-code prompt never reached Scarf — the sheet hung at "Contacting Nous Portal" forever. Fix: when remote, wrap the command in `env PYTHONUNBUFFERED=1 …` to inject the var on the remote side regardless of ssh config. - **Daily keepalive**: Hermes refreshes OAuth access tokens on agent startup but never proactively. If the user goes longer than the refresh-token lifetime (~30 days for Nous) without starting a session, the refresh token itself expires and full re-auth is required. New `OAuthKeepaliveCronService` registers a Scarf-owned daily cron job (`[scarf:oauth-keepalive] OAuth token refresh`) at 4am that runs a minimal one-token prompt — booting the session is what triggers `resolve_nous_runtime_credentials()`. Wired as an opt-in toggle in the OAuth providers section of CredentialPoolsView. When `hermes auth refresh <provider>` lands upstream we'll swap the prompt for that verb; the surrounding wiring stays unchanged. - **Stale-refresh nudge**: `NousSubscriptionState` gains `daysSinceLastRefresh()` + `hasStaleRefresh` (>= 14 days, half of Nous's 30-day refresh-token window). The keepalive section surfaces an inline orange warning when stale and the toggle is off — points the user at the toggle that would have prevented the problem. Verification: scarfCore 263/263; Mac app builds clean. Manual repro of remote stall against Digital Ocean droplet pending user test. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -62,27 +62,36 @@ final class NousAuthFlow {
|
||||
output = ""
|
||||
state = .starting
|
||||
|
||||
let proc = context.makeTransport().makeProcess(
|
||||
executable: context.paths.hermesBinary,
|
||||
args: ["auth", "add", "nous", "--no-browser"]
|
||||
)
|
||||
if !context.isRemote {
|
||||
// Only enrich env locally — remote ssh gets the remote login env
|
||||
// naturally, and exporting our local keys into it would be wrong.
|
||||
// Python block-buffers stdout when it's a pipe (not a TTY). The
|
||||
// device-code flow prints the verification URL + user code, then
|
||||
// enters a ~15-minute polling loop that never hits `input()` —
|
||||
// so nothing flushes and our readability handler never sees the
|
||||
// output. Users see the sheet spinning forever while hermes is
|
||||
// actually waiting for approval.
|
||||
//
|
||||
// PKCE is less exposed to this because `input("Authorization
// code: ")` flushes stdout before blocking, so
// OAuthFlowController usually works even without this setting
// (it sets PYTHONUNBUFFERED as well, for late preamble lines).
|
||||
//
|
||||
// Local: set on `proc.environment`. Remote: setting
|
||||
// `proc.environment` would only configure the local-side ssh
|
||||
// process, NOT the remote python interpreter — ssh doesn't
|
||||
// forward arbitrary env without `SendEnv` configured on both
|
||||
// sides. So for remote we wrap the command in `env
|
||||
// PYTHONUNBUFFERED=1 …`, which prefixes the var into the
|
||||
// remote command's environment regardless of ssh config.
|
||||
let proc: Process
|
||||
if context.isRemote {
|
||||
proc = context.makeTransport().makeProcess(
|
||||
executable: "env",
|
||||
args: ["PYTHONUNBUFFERED=1", context.paths.hermesBinary, "auth", "add", "nous", "--no-browser"]
|
||||
)
|
||||
} else {
|
||||
proc = context.makeTransport().makeProcess(
|
||||
executable: context.paths.hermesBinary,
|
||||
args: ["auth", "add", "nous", "--no-browser"]
|
||||
)
|
||||
var env = HermesFileService.enrichedEnvironment()
|
||||
// Python block-buffers stdout when it's a pipe (not a TTY). The
|
||||
// device-code flow prints the verification URL + user code, then
|
||||
// enters a ~15-minute polling loop that never hits `input()` —
|
||||
// so nothing flushes and our readability handler never sees the
|
||||
// output. Users see the sheet spinning forever while hermes is
|
||||
// actually waiting for approval.
|
||||
//
|
||||
// PKCE doesn't have this problem because `input("Authorization
|
||||
// code: ")` flushes stdout before blocking, which is why
|
||||
// OAuthFlowController works without this setting.
|
||||
//
|
||||
// PYTHONUNBUFFERED forces line-buffered stdout for the whole
|
||||
// subprocess — tiny perf cost, huge UX win for device-code.
|
||||
env["PYTHONUNBUFFERED"] = "1"
|
||||
proc.environment = env
|
||||
}
|
||||
|
||||
@@ -25,6 +25,31 @@ struct NousSubscriptionState: Sendable, Hashable {
|
||||
/// to line up: auth record present *and* `nous` is the active provider.
|
||||
/// Mirrors `NousSubscriptionFeatures.subscribed` on the Python side.
|
||||
var subscribed: Bool { present && providerIsNous }
|
||||
|
||||
/// Days since the auth record was last touched (refreshed by Hermes
|
||||
/// or re-authed by the user). Hermes refreshes on every agent boot,
|
||||
/// so a large value here means the user hasn't started a session
|
||||
/// recently — which is exactly when the refresh token is at risk
|
||||
/// of expiring (typical ~30 day lifetime). Returns nil when
|
||||
/// `updatedAt` is unknown (older Hermes versions). Capped at
|
||||
/// `Int.max` to avoid overflow on absurd inputs.
|
||||
func daysSinceLastRefresh(now: Date = Date()) -> Int? {
    guard let updatedAt else { return nil }
    let seconds = now.timeIntervalSince(updatedAt)
    // A timestamp at or after `now` (clock skew) counts as "refreshed
    // today". NaN also fails this comparison and lands here.
    guard seconds > 0 else { return 0 }
    // Whole elapsed days, rounded toward zero like the integer
    // conversion below.
    let days = (seconds / 86_400).rounded(.down)
    // `Int(_: Double)` traps on infinite or out-of-range values, so
    // clamp before converting. `Double(Int.max)` rounds up to 2^63,
    // which makes the strict `<` the safe bound.
    guard days < Double(Int.max) else { return Int.max }
    return Int(days)
}
|
||||
|
||||
/// True when we haven't seen a Hermes refresh in ≥14 days — half
|
||||
/// the typical 30-day Nous refresh-token lifetime. This is the
|
||||
/// trigger for the "enable keepalive" nudge: still recoverable
|
||||
/// (refresh token hasn't expired yet) but heading there. Returns
|
||||
/// false when `updatedAt` is unknown — we don't nudge on missing
|
||||
/// data, only on confirmed staleness.
|
||||
var hasStaleRefresh: Bool {
    // No age available means no nudge; otherwise apply the 14-day
    // staleness threshold.
    if let elapsedDays = daysSinceLastRefresh() {
        return elapsedDays >= 14
    }
    return false
}
|
||||
}
|
||||
|
||||
/// Reads `auth.json` to detect Nous Portal subscription state. Delegates file
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
import Foundation
|
||||
import ScarfCore
|
||||
import os
|
||||
|
||||
/// Manages a Scarf-owned cron job that keeps OAuth refresh tokens
|
||||
/// alive by booting a trivial Hermes session on a daily cadence.
|
||||
///
|
||||
/// **Why this exists.** Hermes refreshes OAuth access tokens on
|
||||
/// agent startup (via `resolve_nous_runtime_credentials()` and
|
||||
/// equivalents), but never proactively. If the user goes longer than
|
||||
/// the *refresh*-token lifetime without starting a session, the
|
||||
/// refresh token itself expires and only a full re-auth recovers it.
|
||||
/// Refresh-token lifetimes are typically ~30 days; a 24-hour
|
||||
/// heartbeat keeps the window from closing for users who go quiet.
|
||||
///
|
||||
/// **What it runs.** A single cron job with a stable name
|
||||
/// (`Self.jobName`) and a minimal one-token prompt. Executing the
|
||||
/// job boots `hermes acp` end-to-end, which is what triggers the
|
||||
/// refresh. There is no public Hermes CLI verb to refresh a token in
|
||||
/// isolation today (no `hermes auth refresh <provider>`), so booting
|
||||
/// a session is the only mechanism we have. When Hermes adds a
|
||||
/// dedicated refresh verb, swap the prompt for a `--script` that
|
||||
/// invokes it and the surrounding wiring stays unchanged.
|
||||
///
|
||||
/// **Identification.** The job is found by exact-match on
|
||||
/// `Self.jobName`. Users can edit the schedule from the Cron tab
|
||||
/// without breaking detection — only the name is load-bearing here.
|
||||
@MainActor
final class OAuthKeepaliveCronService {
    // MARK: - Constants

    /// Name the keepalive job is registered and looked up under. The
    /// `[scarf:oauth-keepalive]` prefix mirrors the `[tmpl:<id>] …`
    /// convention `ProjectTemplateInstaller` uses for its cron jobs,
    /// marking the schedule as Scarf-owned rather than user-authored.
    static let jobName = "[scarf:oauth-keepalive] OAuth token refresh"

    /// Cron expression used when the job is created: 04:00 every day.
    /// Users may reschedule from the Cron tab; whatever cadence is
    /// chosen has to stay shorter than the shortest refresh-token
    /// lifetime among the configured OAuth providers (~30d for Nous).
    static let defaultSchedule = "0 4 * * *"

    /// Prompt the job runs. It exists only to boot a session, so it
    /// asks for the shortest possible reply to keep the turn cheap.
    static let defaultPrompt = "Reply with the single word 'ok'."

    // MARK: - Dependencies

    let context: ServerContext
    private let fileService: HermesFileService
    private let logger = Logger(subsystem: "com.scarf", category: "OAuthKeepaliveCronService")

    init(context: ServerContext = .local) {
        self.fileService = HermesFileService(context: context)
        self.context = context
    }

    // MARK: - Read

    /// Looks up the keepalive job among the jobs returned by
    /// `loadCronJobs()`, matching on the exact `jobName`.
    /// - Returns: The registered job, or nil when none carries that name.
    nonisolated func currentJob() -> HermesCronJob? {
        let jobs = fileService.loadCronJobs()
        return jobs.first(where: { $0.name == Self.jobName })
    }

    /// Whether a job named `jobName` is currently registered.
    nonisolated func isEnabled() -> Bool {
        let registered = currentJob()
        return registered != nil
    }

    // MARK: - Mutate

    /// Registers the keepalive job through `hermes cron create`.
    /// Returns early when the job already exists, so repeated calls
    /// do not create duplicates.
    /// - Returns: true when the job exists afterwards (created or
    ///   already present), false when the CLI exits non-zero.
    @discardableResult
    nonisolated func enable() async -> Bool {
        guard !isEnabled() else { return true }
        let createArgs = [
            "cron", "create",
            "--name", Self.jobName,
            "--silent",
            Self.defaultSchedule,
            Self.defaultPrompt,
        ]
        let outcome = await Task.detached { [fileService] in
            fileService.runHermesCLI(args: createArgs, timeout: 60)
        }.value
        guard outcome.exitCode == 0 else {
            logger.warning("oauth-keepalive enable failed: exit=\(outcome.exitCode) output=\(outcome.output, privacy: .public)")
            return false
        }
        return true
    }

    /// Removes the keepalive job through `hermes cron remove <id>`.
    /// When no job is registered the call succeeds without invoking
    /// the CLI.
    /// - Returns: true when the job is gone (removed or never there),
    ///   false when the CLI exits non-zero.
    @discardableResult
    nonisolated func disable() async -> Bool {
        guard let registered = currentJob() else { return true }
        let jobID = registered.id
        let outcome = await Task.detached { [fileService] in
            fileService.runHermesCLI(args: ["cron", "remove", jobID], timeout: 30)
        }.value
        guard outcome.exitCode == 0 else {
            logger.warning("oauth-keepalive disable failed: exit=\(outcome.exitCode) output=\(outcome.output, privacy: .public)")
            return false
        }
        return true
    }
}
|
||||
@@ -93,15 +93,29 @@ final class OAuthFlowController {
|
||||
// local spawns hermes directly, remote rounds through ssh -T while
|
||||
// preserving stdin (for the auth-code prompt) and stdout (for the
|
||||
// URL parser).
|
||||
let proc = context.makeTransport().makeProcess(
|
||||
executable: context.paths.hermesBinary,
|
||||
args: args
|
||||
)
|
||||
if !context.isRemote {
|
||||
// Only enrich env locally — the remote ssh process gets the
|
||||
// remote login env naturally, and exporting our local API keys
|
||||
// into it would be wrong.
|
||||
proc.environment = HermesFileService.enrichedEnvironment()
|
||||
//
|
||||
// PYTHONUNBUFFERED forces line-buffered Python stdout so the URL
|
||||
// banner reaches us before `input("Authorization code: ")`
|
||||
// blocks. PKCE *usually* recovers because input() flushes, but
|
||||
// certain providers print preamble lines AFTER the prompt that
|
||||
// we still want streamed in real time. Local: set on
|
||||
// `proc.environment`. Remote: ssh doesn't forward arbitrary env
|
||||
// vars without `SendEnv` configured, so wrap the command in
|
||||
// `env PYTHONUNBUFFERED=1 …` to inject it on the remote side.
|
||||
let proc: Process
|
||||
if context.isRemote {
|
||||
proc = context.makeTransport().makeProcess(
|
||||
executable: "env",
|
||||
args: ["PYTHONUNBUFFERED=1", context.paths.hermesBinary] + args
|
||||
)
|
||||
} else {
|
||||
proc = context.makeTransport().makeProcess(
|
||||
executable: context.paths.hermesBinary,
|
||||
args: args
|
||||
)
|
||||
var env = HermesFileService.enrichedEnvironment()
|
||||
env["PYTHONUNBUFFERED"] = "1"
|
||||
proc.environment = env
|
||||
}
|
||||
|
||||
let outPipe = Pipe()
|
||||
|
||||
@@ -15,8 +15,27 @@ struct CredentialPoolsView: View {
|
||||
@State private var reauthInitialProvider: String?
|
||||
@Environment(AppCoordinator.self) private var coordinator
|
||||
|
||||
/// Mirror of `OAuthKeepaliveCronService.isEnabled()` so the
|
||||
/// toggle reads from local @State (instant) instead of hitting
|
||||
/// disk on every render. `nil` while the initial probe is in
|
||||
/// flight; reloaded on appear and after every enable/disable.
|
||||
@State private var keepaliveEnabled: Bool?
|
||||
@State private var keepaliveBusy: Bool = false
|
||||
@State private var keepaliveError: String?
|
||||
/// Cached Nous subscription state. Used by `keepaliveSection` to
|
||||
/// surface a contextual nudge when the auth record hasn't been
|
||||
/// refreshed in ≥14 days — that's exactly when enabling the
|
||||
/// keepalive cron is highest-value. Loaded async on appear; the
|
||||
/// section renders without the nudge while this is `.absent`.
|
||||
@State private var nousSubscription: NousSubscriptionState = .absent
|
||||
|
||||
private let keepalive: OAuthKeepaliveCronService
|
||||
private let nousService: NousSubscriptionService
|
||||
|
||||
init(context: ServerContext) {
    // The view model and both services are built from the same
    // server context.
    self.nousService = NousSubscriptionService(context: context)
    self.keepalive = OAuthKeepaliveCronService(context: context)
    _viewModel = State(initialValue: CredentialPoolsViewModel(context: context))
}
|
||||
|
||||
|
||||
@@ -32,6 +51,7 @@ struct CredentialPoolsView: View {
|
||||
emptyState
|
||||
} else {
|
||||
if !viewModel.oauthProviders.isEmpty {
|
||||
keepaliveSection
|
||||
oauthProvidersSection
|
||||
}
|
||||
ForEach(viewModel.pools) { pool in
|
||||
@@ -53,6 +73,7 @@ struct CredentialPoolsView: View {
|
||||
.onAppear {
|
||||
viewModel.load()
|
||||
consumePendingReauth()
|
||||
probeKeepalive()
|
||||
}
|
||||
.onChange(of: coordinator.pendingOAuthReauth) { _, _ in
|
||||
consumePendingReauth()
|
||||
@@ -91,6 +112,104 @@ struct CredentialPoolsView: View {
|
||||
coordinator.pendingOAuthReauth = nil
|
||||
}
|
||||
|
||||
/// Read the current keepalive cron job state off the main
|
||||
/// thread. Disk reads on remote contexts can take 100–300ms
|
||||
/// (one SFTP round-trip for `~/.hermes/cron/jobs.json`) so this
|
||||
/// hops to a detached task and only flips `keepaliveEnabled` on
|
||||
/// MainActor when the result lands. The same detached task then
/// loads the Nous subscription record (after the cron check, not
/// in parallel) so the staleness nudge comes from the same probe.
|
||||
private func probeKeepalive() {
    // Copy the services into locals for the detached task to capture.
    let cronService = keepalive
    let subscriptionService = nousService
    Task.detached {
        // Both reads run off the main actor, one after the other.
        let jobRegistered = cronService.isEnabled()
        let subscription = subscriptionService.loadState()
        // Publish both results together back on the main actor.
        await MainActor.run {
            keepaliveEnabled = jobRegistered
            nousSubscription = subscription
        }
    }
}
|
||||
|
||||
/// Section above the OAuth providers list with a single toggle
|
||||
/// that registers / removes a Scarf-owned daily cron job. The
|
||||
/// job's only purpose is to boot a Hermes session, which is what
|
||||
/// causes Hermes to refresh OAuth access tokens (no standalone
|
||||
/// CLI verb for refresh exists today). The toggle is shown off and
/// disabled until the initial probe reports the current state, so
/// the user can't flip it against an unknown state.
|
||||
@ViewBuilder
|
||||
private var keepaliveSection: some View {
    // `nil` means the initial probe has not reported yet: render the
    // switch as off and keep it disabled.
    let awaitingProbe = keepaliveEnabled == nil
    let toggleIsOn = keepaliveEnabled ?? false
    // Nudge only when the job is confirmed absent and the Nous
    // record reports a stale refresh.
    let showStaleNudge = keepaliveEnabled == false && nousSubscription.hasStaleRefresh
    // The setter does not write state directly; it starts the CLI
    // round-trip, which updates `keepaliveEnabled` once it finishes.
    let toggleBinding = Binding<Bool>(
        get: { toggleIsOn },
        set: { requested in toggleKeepalive(to: requested) }
    )
    SettingsSection(title: LocalizedStringKey("Keep tokens fresh"), icon: "arrow.clockwise") {
        HStack(alignment: .top, spacing: 12) {
            Image(systemName: "arrow.clockwise.circle")
                .foregroundStyle(.secondary)
            VStack(alignment: .leading, spacing: 4) {
                Toggle(isOn: toggleBinding) {
                    Text("Auto-refresh OAuth tokens daily")
                        .font(.system(.body, weight: .medium))
                }
                .toggleStyle(.switch)
                .disabled(awaitingProbe || keepaliveBusy)
                Text("Registers a `\(OAuthKeepaliveCronService.jobName)` cron job that runs at 4am daily. Booting a Hermes session is what triggers token refresh — without this, refresh tokens silently expire if you go ~30 days without using Scarf.")
                    .font(.caption)
                    .foregroundStyle(.secondary)
                    .fixedSize(horizontal: false, vertical: true)
                // Staleness warning, shown only when an age is available.
                if showStaleNudge, let days = nousSubscription.daysSinceLastRefresh() {
                    HStack(spacing: 6) {
                        Image(systemName: "exclamationmark.triangle.fill")
                            .foregroundStyle(.orange)
                        Text("Your Nous subscription was last refreshed \(days) days ago. Enable the toggle above to prevent the refresh token from expiring.")
                            .font(.caption)
                            .foregroundStyle(.orange)
                            .fixedSize(horizontal: false, vertical: true)
                    }
                    .padding(.top, 4)
                }
                // Message from the last failed enable/disable attempt.
                if let failureMessage = keepaliveError {
                    Text(failureMessage)
                        .font(.caption)
                        .foregroundStyle(.red)
                        .textSelection(.enabled)
                }
            }
            Spacer(minLength: 0)
            // Spinner while an enable/disable call is in flight.
            if keepaliveBusy {
                ProgressView().controlSize(.small)
            }
        }
        .padding(.horizontal, 12)
        .padding(.vertical, 8)
        .background(.quaternary.opacity(0.3))
    }
}
|
||||
|
||||
/// Enables or disables the keepalive cron job in response to the
/// toggle, then re-reads the registered state so the switch shows
/// what is actually registered rather than what was requested.
/// - Parameter newValue: true to register the job, false to remove it.
private func toggleKeepalive(to newValue: Bool) {
    // Ignore taps while a previous request is still running.
    if keepaliveBusy { return }
    keepaliveBusy = true
    keepaliveError = nil
    let cronService = keepalive
    Task.detached {
        let succeeded: Bool
        if newValue {
            succeeded = await cronService.enable()
        } else {
            succeeded = await cronService.disable()
        }
        // Re-read whether the job is registered now, whatever the CLI reported.
        let registeredNow = cronService.isEnabled()
        await MainActor.run {
            keepaliveBusy = false
            keepaliveEnabled = registeredNow
            if !succeeded {
                if newValue {
                    keepaliveError = "Couldn't register the keepalive cron job. Check `hermes cron` works in a terminal."
                } else {
                    keepaliveError = "Couldn't remove the keepalive cron job. Check `hermes cron remove` works in a terminal."
                }
            }
        }
    }
}
|
||||
|
||||
private var header: some View {
|
||||
ScarfPageHeader(
|
||||
"Credential Pools",
|
||||
|
||||
Reference in New Issue
Block a user