mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 18:44:45 +00:00
feat(hermes-v12): ACP multimodal image input on Mac + iOS (Phase C)
Hermes v0.12 advertises `prompt_capabilities.image = true` and accepts
image content blocks in `session/prompt`. This wires a producer flow on
both targets so users can attach images alongside text and have them
routed to the vision-capable model automatically.
Pipeline:
- ChatImageAttachment: Sendable value type holding base64 payload +
thumbnail, MIME type, source filename, and approximate byte count.
- ImageEncoder: detached-only Sendable service that downsamples to
Anthropic's 1568px long-edge cap, JPEG-encodes at q=0.85, and
produces a small inline thumbnail for composer chips. Cross-platform
(NSImage on Mac, UIImage on iOS, JPEG-passthrough on Linux/CI).
- ACPClient.sendPrompt(sessionId:text:images:) overload emits a content
array `[{type: "text"...}, {type: "image", data, mimeType}]` matching
the wire shape in hermes-agent/acp_adapter/server.py. The
zero-arg-images convenience overload preserves the v0.11 wire shape
for any unmodified callers.
Mac UI:
- RichChatInputBar grew an `attachments: [ChatImageAttachment]` state
array, a paperclip button (NSOpenPanel multi-pick), drag-drop and
paste handlers, and a horizontal preview chip strip. The "send"
callback's signature is `(String, [ChatImageAttachment]) -> Void`
threaded through RichChatView -> ChatTranscriptPane -> ChatView ->
ChatViewModel.sendText(text, images:). Image-only prompts are
permitted ("describe this") once at least one attachment is queued.
iOS UI:
- ChatView's composer adopts a paperclip + PhotosPicker flow with the
same chip strip and 5-attachment cap. Attachments live on
ChatController so they survive across PhotosPicker presentations.
loadTransferable(type: Data.self) feeds raw bytes into the same
ImageEncoder; encode work runs detached so MainActor stays
responsive on cellular.
Capability gating:
- Both composers hide the entire attachment surface when
HermesCapabilities.hasACPImagePrompts is false (pre-v0.12 hosts).
No paperclip button, no drop target, no paste accept — the input bar
is byte-for-byte the v0.11 surface against an older Hermes.
Tests: 209 ScarfCore tests pass; both Mac and iOS schemes build clean.
The encoder's pixel work is hard to unit-test at the package level
(no NSImage/UIImage in plain Swift CI) — manual end-to-end testing
is the verification path here.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -266,14 +266,47 @@ public actor ACPClient {
|
||||
// MARK: - Messaging
|
||||
|
||||
/// Legacy v0.11-shape convenience: send a text-only prompt with no image
/// attachments. Forwards to the images overload with an empty array, so
/// unmodified callers keep working against any Hermes version.
public func sendPrompt(sessionId: String, text: String) async throws -> ACPPromptResult {
    try await sendPrompt(sessionId: sessionId, text: text, images: [])
}
|
||||
|
||||
/// v0.12+ overload: forward zero or more image attachments alongside
|
||||
/// the user's text. Each attachment becomes a separate
|
||||
/// `ImageContentBlock` in the ACP `prompt` content array — matches
|
||||
/// the shape Hermes' `acp_adapter/server.py` expects (text first,
|
||||
/// then image blocks). Hermes routes the resulting payload to a
|
||||
/// vision-capable model automatically; the producer side only has
|
||||
/// to deliver the bytes.
|
||||
///
|
||||
/// Pre-v0.12 Hermes installs accepted only a single `text` block.
|
||||
/// Callers gate this overload on
|
||||
/// `HermesCapabilitiesStore.capabilities.hasACPImagePrompts` so we
|
||||
/// don't send blocks an older agent would silently drop.
|
||||
public func sendPrompt(
|
||||
sessionId: String,
|
||||
text: String,
|
||||
images: [ChatImageAttachment]
|
||||
) async throws -> ACPPromptResult {
|
||||
statusMessage = "Sending prompt..."
|
||||
let messageId = UUID().uuidString
|
||||
|
||||
// Always include the text block, even when empty — keeps the
|
||||
// server-side text-extraction path stable regardless of whether
|
||||
// the user sent text alongside the image(s).
|
||||
var promptBlocks: [[String: Any]] = [
|
||||
["type": "text", "text": text] as [String: Any],
|
||||
]
|
||||
for image in images {
|
||||
promptBlocks.append([
|
||||
"type": "image",
|
||||
"data": image.base64Data,
|
||||
"mimeType": image.mimeType,
|
||||
] as [String: Any])
|
||||
}
|
||||
|
||||
let params: [String: AnyCodable] = [
|
||||
"sessionId": AnyCodable(sessionId),
|
||||
"messageId": AnyCodable(messageId),
|
||||
"prompt": AnyCodable([
|
||||
["type": "text", "text": text] as [String: Any],
|
||||
] as [Any]),
|
||||
"prompt": AnyCodable(promptBlocks as [Any]),
|
||||
]
|
||||
let result = try await sendRequest(method: "session/prompt", params: params)
|
||||
let dict = result?.dictValue ?? [:]
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
import Foundation
|
||||
|
||||
/// One image attached to an outgoing chat prompt.
|
||||
///
|
||||
/// Hermes v0.12 ACP advertises `prompt_capabilities.image = true` and
|
||||
/// accepts content-block arrays in `session/prompt`. Scarf produces these
|
||||
/// blocks from drag-dropped / pasted / picker-selected images. We
|
||||
/// downsample + JPEG-encode at the producer side so the wire payload
|
||||
/// stays under a few hundred kilobytes per image even when the user
|
||||
/// drops a 12 MP screenshot.
|
||||
///
|
||||
/// Constructed via `ImageEncoder.encode(...)`. The store-the-bytes-once
|
||||
/// shape means `RichChatViewModel` can keep the array between turns
|
||||
/// (e.g. while the agent is responding) without holding `NSImage` /
|
||||
/// `UIImage` references that would pin the originals in memory.
|
||||
/// One image attached to an outgoing chat prompt.
///
/// Hermes v0.12 ACP advertises `prompt_capabilities.image = true` and
/// accepts content-block arrays in `session/prompt`. Scarf builds those
/// blocks from dropped / pasted / picker-selected images; downsampling
/// and JPEG re-encoding happen on the producer side so each image stays
/// within a few hundred kilobytes on the wire.
///
/// Values are produced by `ImageEncoder.encode(...)`. Because this is a
/// plain `Sendable` value type holding already-encoded bytes, view
/// models can keep an array of them between turns without pinning any
/// `NSImage` / `UIImage` originals in memory.
public struct ChatImageAttachment: Sendable, Equatable, Identifiable {
    public let id: String
    /// IANA MIME type, mirrored into the ACP `ImageContentBlock`'s
    /// `mimeType` field. Typically `image/jpeg` after re-encoding;
    /// small PNG-only originals may keep their type.
    public let mimeType: String
    /// Base64 payload with no `data:` prefix — Hermes adds any wrapping
    /// itself when forwarding to multimodal provider payloads (see
    /// `_image_block_to_openai_part` in `acp_adapter/server.py`).
    public let base64Data: String
    /// Optional small preview for the composer chip, same MIME type as
    /// `base64Data`. Nil when no separate thumbnail was produced.
    public let thumbnailBase64: String?
    /// Source filename when known (drag-drop carries one; paste does
    /// not). Surfaced as a tooltip on the preview chip.
    public let filename: String?
    /// Approximate decoded byte count, used by the composer's
    /// "X images, Y KB" status pill.
    public let approximateByteCount: Int

    public init(
        id: String = UUID().uuidString,
        mimeType: String,
        base64Data: String,
        thumbnailBase64: String?,
        filename: String?,
        approximateByteCount: Int
    ) {
        self.approximateByteCount = approximateByteCount
        self.filename = filename
        self.thumbnailBase64 = thumbnailBase64
        self.base64Data = base64Data
        self.mimeType = mimeType
        self.id = id
    }
}
|
||||
@@ -0,0 +1,162 @@
|
||||
import Foundation
|
||||
#if canImport(AppKit)
|
||||
import AppKit
|
||||
#endif
|
||||
#if canImport(UIKit)
|
||||
import UIKit
|
||||
#endif
|
||||
#if canImport(CoreImage)
|
||||
import CoreImage
|
||||
#endif
|
||||
|
||||
/// Downsamples + base64-encodes user-supplied images for ACP transport.
|
||||
///
|
||||
/// **Why downsample on the producer side.** Hermes happily forwards the
|
||||
/// bytes to a vision model, but a 12 MP screenshot at 4 MB is wasteful
|
||||
/// — it eats 5–6× more tokens than a 1024×1024 thumbnail and gives the
|
||||
/// model no extra signal. Cap the long edge at 1568 px (Anthropic's
|
||||
/// recommended max for Claude vision) and drop quality to JPEG 0.85,
|
||||
/// which keeps screenshot text crisp while landing under ~300 KB per
|
||||
/// image. The 5-image-per-message limit (chosen on the producer side)
|
||||
/// keeps the total prompt payload below ~2 MB.
|
||||
///
|
||||
/// **Why detached.** Image loading + downsampling is CPU-bound. Run only
|
||||
/// from a `Task.detached` context (the encoder type is `Sendable` and
|
||||
/// every method is `nonisolated`). The companion `ChatImageAttachment`
|
||||
/// is a Sendable value type so the result hops back to MainActor cleanly.
|
||||
/// Downsamples + base64-encodes user-supplied images for ACP transport.
///
/// **Why downsample on the producer side.** Hermes forwards the bytes to
/// a vision model unchanged; a 12 MP screenshot at 4 MB eats several
/// times more tokens than a ~1500 px version and gives the model no
/// extra signal. Capping the long edge at 1568 px (Anthropic's
/// recommended maximum for Claude vision) at JPEG quality 0.85 keeps
/// screenshot text crisp while typically landing under ~300 KB/image.
///
/// **Why detached.** Decoding + downsampling is CPU-bound. Run only from
/// a `Task.detached` context — the type is `Sendable`, every method is
/// `nonisolated`, and the resulting `ChatImageAttachment` is a Sendable
/// value type that hops back to MainActor cleanly.
public struct ImageEncoder: Sendable {
    /// Long-edge pixel cap — Anthropic's recommended ceiling for Claude
    /// vision input. Past it the provider downsamples server-side and we
    /// just paid for the extra bytes. Tweak only with vision-model
    /// guidance from the Hermes side.
    public static let maxLongEdge: CGFloat = 1568
    /// JPEG quality factor. 0.85 is the inflection point above which
    /// file size jumps quickly without obvious visual gain on
    /// screenshots or photographs.
    public static let jpegQuality: CGFloat = 0.85
    /// Long-edge cap for the inline thumbnail rendered in the composer
    /// chip. Kept small so the chip renders without extra resampling.
    public static let thumbnailLongEdge: CGFloat = 256

    public init() {}

    /// Failure modes surfaced to the composer's error strip.
    public enum EncoderError: Error, LocalizedError {
        case unsupportedFormat
        case decodeFailed
        case encodeFailed
        case empty

        public var errorDescription: String? {
            switch self {
            case .unsupportedFormat: return "Image format not recognized"
            case .decodeFailed: return "Couldn't decode image data"
            case .encodeFailed: return "Couldn't encode image as JPEG"
            case .empty: return "Image data was empty"
            }
        }
    }

    /// Encode raw bytes (from a paste/drop/picker) into a wire-ready
    /// attachment. Detached-only — never call from MainActor. The
    /// originating bytes are not retained beyond this call.
    ///
    /// - Parameters:
    ///   - rawBytes: Undecoded image file data (PNG, JPEG, TIFF, …).
    ///   - sourceFilename: Original filename when known; carried through
    ///     to the attachment for tooltip display.
    /// - Throws: `EncoderError.empty` for zero-byte input,
    ///   `.decodeFailed` when the platform image type can't parse the
    ///   bytes, `.encodeFailed` when JPEG serialization fails, and
    ///   `.unsupportedFormat` on platforms without AppKit/UIKit when the
    ///   input isn't already a JPEG.
    public nonisolated func encode(
        rawBytes: Data,
        sourceFilename: String? = nil
    ) throws -> ChatImageAttachment {
        guard !rawBytes.isEmpty else { throw EncoderError.empty }

        #if canImport(AppKit)
        guard let nsImage = NSImage(data: rawBytes) else { throw EncoderError.decodeFailed }
        // NOTE(review): NSImage.size reports points, not pixels — a 2x
        // Retina source may measure at half its pixel dimensions here.
        // Confirm the 1568 cap is intended to apply to points on macOS.
        let targetSize = Self.fittedSize(for: nsImage.size, maxLongEdge: Self.maxLongEdge)
        let mainData = try Self.jpegBytes(from: nsImage, size: targetSize)
        let thumbSize = Self.fittedSize(for: nsImage.size, maxLongEdge: Self.thumbnailLongEdge)
        // Thumbnail failure is non-fatal: the composer chip falls back to
        // a placeholder icon when thumbnailBase64 is nil.
        let thumbData = try? Self.jpegBytes(from: nsImage, size: thumbSize)
        return ChatImageAttachment(
            mimeType: "image/jpeg",
            base64Data: mainData.base64EncodedString(),
            thumbnailBase64: thumbData?.base64EncodedString(),
            filename: sourceFilename,
            approximateByteCount: mainData.count
        )

        #elseif canImport(UIKit)
        guard let uiImage = UIImage(data: rawBytes) else { throw EncoderError.decodeFailed }
        let targetSize = Self.fittedSize(for: uiImage.size, maxLongEdge: Self.maxLongEdge)
        let mainData = try Self.jpegBytes(from: uiImage, size: targetSize)
        let thumbSize = Self.fittedSize(for: uiImage.size, maxLongEdge: Self.thumbnailLongEdge)
        // Thumbnail failure is non-fatal (see AppKit branch).
        let thumbData = try? Self.jpegBytes(from: uiImage, size: thumbSize)
        return ChatImageAttachment(
            mimeType: "image/jpeg",
            base64Data: mainData.base64EncodedString(),
            thumbnailBase64: thumbData?.base64EncodedString(),
            filename: sourceFilename,
            approximateByteCount: mainData.count
        )

        #else
        // Linux CI / unknown platforms: pass through raw bytes if the
        // input already carries the JPEG SOI marker (FF D8), else refuse.
        // Keeps the package compiling without a hard AppKit/UIKit dep.
        if rawBytes.starts(with: [0xFF, 0xD8]) {
            return ChatImageAttachment(
                mimeType: "image/jpeg",
                base64Data: rawBytes.base64EncodedString(),
                thumbnailBase64: nil,
                filename: sourceFilename,
                approximateByteCount: rawBytes.count
            )
        }
        throw EncoderError.unsupportedFormat
        #endif
    }

    /// Scale `source` so its long edge fits under `maxLongEdge`,
    /// preserving aspect ratio; returns `source` unchanged when it
    /// already fits. Each dimension is clamped to at least 1 px so a
    /// degenerate aspect ratio (e.g. 10000×1) can never round a side
    /// down to zero — a zero-sized canvas would make the downstream
    /// draw/encode step fail.
    nonisolated private static func fittedSize(for source: CGSize, maxLongEdge: CGFloat) -> CGSize {
        let longest = max(source.width, source.height)
        if longest <= maxLongEdge { return source }
        let scale = maxLongEdge / longest
        return CGSize(
            width: max(1, floor(source.width * scale)),
            height: max(1, floor(source.height * scale))
        )
    }

    #if canImport(AppKit)
    /// Draw `image` into a `size` canvas and serialize it as JPEG at
    /// `jpegQuality` via a TIFF → bitmap-rep round trip.
    nonisolated private static func jpegBytes(from image: NSImage, size: CGSize) throws -> Data {
        let resized = NSImage(size: size)
        resized.lockFocus()
        NSGraphicsContext.current?.imageInterpolation = .high
        image.draw(
            in: CGRect(origin: .zero, size: size),
            from: .zero,
            operation: .copy,
            fraction: 1.0
        )
        resized.unlockFocus()
        guard let tiff = resized.tiffRepresentation,
              let rep = NSBitmapImageRep(data: tiff),
              let data = rep.representation(
                using: .jpeg,
                properties: [.compressionFactor: jpegQuality]
              )
        else {
            throw EncoderError.encodeFailed
        }
        return data
    }
    #elseif canImport(UIKit)
    /// Redraw `image` at `size` with a 1x-scale renderer (we want exact
    /// pixel dimensions, not screen-scale multiples) and serialize as
    /// JPEG at `jpegQuality`. Opaque: JPEG has no alpha channel anyway.
    nonisolated private static func jpegBytes(from image: UIImage, size: CGSize) throws -> Data {
        let format = UIGraphicsImageRendererFormat()
        format.scale = 1
        format.opaque = true
        let renderer = UIGraphicsImageRenderer(size: size, format: format)
        let resized = renderer.image { _ in
            image.draw(in: CGRect(origin: .zero, size: size))
        }
        guard let data = resized.jpegData(compressionQuality: jpegQuality) else {
            throw EncoderError.encodeFailed
        }
        return data
    }
    #endif
}
|
||||
@@ -3,6 +3,9 @@ import ScarfCore
|
||||
import ScarfIOS
|
||||
import ScarfDesign
|
||||
import os
|
||||
#if canImport(PhotosUI)
|
||||
import PhotosUI
|
||||
#endif
|
||||
|
||||
// The Chat feature on iOS is gated on `canImport(SQLite3)` because
|
||||
// `RichChatViewModel` reads session history from `HermesDataService`
|
||||
@@ -24,9 +27,23 @@ struct ChatView: View {
|
||||
|
||||
@Environment(\.scarfGoCoordinator) private var coordinator
|
||||
@Environment(\.serverContext) private var envContext
|
||||
@Environment(\.hermesCapabilities) private var capabilitiesStore
|
||||
@State private var controller: ChatController
|
||||
@State private var showProjectPicker = false
|
||||
@State private var showSlashCommandsSheet = false
|
||||
/// PhotosPicker selection. Bridge between SwiftUI's selection
|
||||
/// binding and our `ChatImageAttachment` payload — `loadTransferable`
|
||||
/// produces raw `Data` we then hand to `ImageEncoder`. v0.12+ only.
|
||||
@State private var pickerSelection: [PhotosPickerItem] = []
|
||||
@State private var showPhotoPicker = false
|
||||
@State private var isEncodingAttachment = false
|
||||
@State private var attachmentError: String?
|
||||
|
||||
private static let maxAttachments = 5
|
||||
|
||||
/// Whether the connected Hermes advertises ACP image prompts (v0.12+).
/// Resolves to false while the capability store is still unavailable,
/// which keeps the attachment UI hidden against older hosts.
private var supportsImagePrompts: Bool {
    guard let store = capabilitiesStore else { return false }
    return store.capabilities.hasACPImagePrompts
}
|
||||
/// Drives the composer's keyboard. Bound to the TextField via
|
||||
/// `.focused(...)`; cleared by the scroll-to-dismiss gesture on
|
||||
/// the message list AND by an explicit keyboard-toolbar button.
|
||||
@@ -431,7 +448,108 @@ struct ChatView: View {
|
||||
}
|
||||
|
||||
/// Composer stack: optional attachment preview strip above the input
/// row. The PhotosPicker sheet and its selection-change handler are
/// attached here, compiled out when PhotosUI is unavailable.
private var composer: some View {
    VStack(alignment: .leading, spacing: 4) {
        // Show the strip while anything is attached, while the encoder
        // is still working, or while an error message is visible.
        if !controller.attachments.isEmpty || isEncodingAttachment || attachmentError != nil {
            attachmentStrip
        }
        composerRow
    }
    .padding(.horizontal, 12)
    .padding(.vertical, 8)
    .background(.regularMaterial)
    #if canImport(PhotosUI)
    .photosPicker(
        isPresented: $showPhotoPicker,
        selection: $pickerSelection,
        // Never offer more picker slots than remain under the cap.
        maxSelectionCount: max(0, Self.maxAttachments - controller.attachments.count),
        matching: .images
    )
    .onChange(of: pickerSelection) { _, items in
        ingestPickerItems(items)
    }
    #endif
}
|
||||
|
||||
/// Horizontal preview strip above the composer row: encoding spinner,
/// one chip per queued attachment, any transient error text, and an
/// "n/5" capacity count.
@ViewBuilder
private var attachmentStrip: some View {
    HStack(alignment: .center, spacing: 8) {
        // Spinner while ImageEncoder is working on picked bytes.
        if isEncodingAttachment {
            ProgressView().controlSize(.small)
            Text("Encoding…")
                .font(.caption)
                .foregroundStyle(.secondary)
        }
        ForEach(controller.attachments) { attachment in
            attachmentChip(attachment)
        }
        // Transient decode/encode failure; cleared by the ingest path.
        if let err = attachmentError {
            Text(err)
                .font(.caption)
                .foregroundStyle(ScarfColor.danger)
        }
        Spacer(minLength: 0)
        // "2/5" capacity indicator.
        if !controller.attachments.isEmpty {
            Text("\(controller.attachments.count)/\(Self.maxAttachments)")
                .font(.caption2)
                .foregroundStyle(.tertiary)
        }
    }
}
|
||||
|
||||
/// One preview chip: a 32×32 thumbnail plus an X button that removes
/// the attachment from the queue.
@ViewBuilder
private func attachmentChip(_ attachment: ChatImageAttachment) -> some View {
    HStack(spacing: 4) {
        attachmentChipThumbnail(attachment)
            .frame(width: 32, height: 32)
            .clipShape(RoundedRectangle(cornerRadius: 4))
        Button {
            // Remove by id — ids are unique per attachment.
            controller.attachments.removeAll { $0.id == attachment.id }
        } label: {
            Image(systemName: "xmark.circle.fill")
                .foregroundStyle(.secondary)
        }
        .buttonStyle(.plain)
        .accessibilityLabel("Remove attached image")
    }
    .padding(.horizontal, 6)
    .padding(.vertical, 4)
    .background(
        RoundedRectangle(cornerRadius: 8)
            .fill(ScarfColor.backgroundSecondary)
    )
}
|
||||
|
||||
/// Decoded thumbnail image for a chip, or a generic photo placeholder
/// when the attachment has no thumbnail or its base64 fails to decode.
@ViewBuilder
private func attachmentChipThumbnail(_ attachment: ChatImageAttachment) -> some View {
    if let thumb = attachment.thumbnailBase64,
       let data = Data(base64Encoded: thumb),
       let image = UIImage(data: data) {
        Image(uiImage: image)
            .resizable()
            .aspectRatio(contentMode: .fill)
    } else {
        // Fallback: SF Symbol placeholder filling the chip frame.
        Image(systemName: "photo")
            .foregroundStyle(.secondary)
            .frame(maxWidth: .infinity, maxHeight: .infinity)
            .background(ScarfColor.backgroundSecondary)
    }
}
|
||||
|
||||
private var composerRow: some View {
|
||||
HStack(alignment: .bottom, spacing: 8) {
|
||||
if supportsImagePrompts {
|
||||
Button {
|
||||
showPhotoPicker = true
|
||||
} label: {
|
||||
Image(systemName: "paperclip")
|
||||
.font(.system(size: 22))
|
||||
.foregroundStyle(.secondary)
|
||||
.padding(.bottom, 4)
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
.disabled(controller.state != .ready || controller.attachments.count >= Self.maxAttachments)
|
||||
.accessibilityLabel("Attach image")
|
||||
}
|
||||
TextField(
|
||||
"Message…",
|
||||
text: $controller.draft,
|
||||
@@ -480,13 +598,58 @@ struct ChatView: View {
|
||||
Image(systemName: "arrow.up.circle.fill")
|
||||
.font(.system(size: 28))
|
||||
}
|
||||
.disabled(controller.state != .ready || controller.draft.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
|
||||
.disabled(!canSendComposer)
|
||||
}
|
||||
.padding(.horizontal, 12)
|
||||
.padding(.vertical, 8)
|
||||
.background(.regularMaterial)
|
||||
}
|
||||
|
||||
/// Send is enabled when the controller is ready AND the user supplied
/// either non-whitespace text or at least one attachment — image-only
/// sends ("describe this") are valid for vision models.
private var canSendComposer: Bool {
    let trimmed = controller.draft.trimmingCharacters(in: .whitespacesAndNewlines)
    return controller.state == .ready && (!controller.attachments.isEmpty || !trimmed.isEmpty)
}
|
||||
|
||||
/// Pull raw image bytes out of each PhotosPickerItem and feed them
/// through ImageEncoder. The heavyweight resize + JPEG-encode work runs
/// detached so MainActor stays responsive; the resulting Sendable
/// attachment hops back to MainActor for state mutation.
///
/// PhotosPickerItem delivers `Data` directly via the `Transferable`
/// API. The selection binding is reset immediately so a follow-up pick
/// re-fires `onChange` even for an identical selection.
#if canImport(PhotosUI)
private func ingestPickerItems(_ items: [PhotosPickerItem]) {
    guard !items.isEmpty else { return }
    // Snapshot then clear the binding: PhotosPicker doesn't re-fire
    // onChange for an identical selection unless it flips through empty.
    let snapshot = items
    pickerSelection = []
    isEncodingAttachment = true
    Task { @MainActor in
        for item in snapshot {
            // Respect the 5-attachment cap even mid-batch.
            guard controller.attachments.count < Self.maxAttachments else { break }
            do {
                guard let data = try await item.loadTransferable(type: Data.self) else { continue }
                let attachment = try await Task.detached(priority: .userInitiated) {
                    try ImageEncoder().encode(rawBytes: data, sourceFilename: nil)
                }.value
                controller.attachments.append(attachment)
            } catch {
                let message = (error as? LocalizedError)?.errorDescription ?? "Couldn't encode image"
                attachmentError = message
                // Auto-clear after 4s — but only if a newer error hasn't
                // replaced this one in the meantime. (Previously the
                // delayed task cleared unconditionally, wiping fresher
                // messages from later failures.)
                Task { @MainActor in
                    try? await Task.sleep(nanoseconds: 4_000_000_000)
                    if attachmentError == message {
                        attachmentError = nil
                    }
                }
            }
        }
        isEncodingAttachment = false
    }
}
#endif
|
||||
|
||||
@State private var showErrorDetails: Bool = false
|
||||
|
||||
/// Inline error banner rendered above the message list when the
|
||||
@@ -696,6 +859,12 @@ final class ChatController {
|
||||
var vm: RichChatViewModel
|
||||
var draft: String = ""
|
||||
|
||||
/// v0.12+ image attachments queued to send with the next prompt.
|
||||
/// Capped at 5 by the composer UI; the cap matches the Mac behavior
|
||||
/// and keeps total ACP prompt payload under ~2 MB even on a slow
|
||||
/// cellular link. Cleared after each successful `send()`.
|
||||
var attachments: [ChatImageAttachment] = []
|
||||
|
||||
/// Set when chat-start is blocked because the active server's
|
||||
/// `config.yaml` has no `model.default` / `model.provider`. ChatView
|
||||
/// observes this to present an inline "pick a model" sheet — the
|
||||
@@ -1003,12 +1172,22 @@ final class ChatController {
|
||||
func send() async {
|
||||
guard state == .ready, let client else { return }
|
||||
let text = draft.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard !text.isEmpty else { return }
|
||||
// v0.12+ allows image-only sends — vision models accept "describe
|
||||
// this" with no text. Bail only when both fields are empty.
|
||||
guard !text.isEmpty || !attachments.isEmpty else { return }
|
||||
let sessionId = vm.sessionId ?? ""
|
||||
guard !sessionId.isEmpty else { return }
|
||||
let images = attachments
|
||||
attachments = []
|
||||
draft = ""
|
||||
clearStoredDraft()
|
||||
vm.addUserMessage(text: text)
|
||||
if !text.isEmpty {
|
||||
vm.addUserMessage(text: text)
|
||||
} else {
|
||||
// Surface an image-only message so the user sees their bubble
|
||||
// even when they didn't type any caption.
|
||||
vm.addUserMessage(text: "[image attached]")
|
||||
}
|
||||
// /steer is non-interruptive — the agent is still on its
|
||||
// current turn; the guidance applies after the next tool call.
|
||||
// Surface a transient toast confirming the guidance was
|
||||
@@ -1029,7 +1208,7 @@ final class ChatController {
|
||||
// literally. v2.5.
|
||||
let wireText = expandIfProjectScoped(text)
|
||||
do {
|
||||
_ = try await client.sendPrompt(sessionId: sessionId, text: wireText)
|
||||
_ = try await client.sendPrompt(sessionId: sessionId, text: wireText, images: images)
|
||||
} catch {
|
||||
// The event task may already have surfaced a
|
||||
// .connectionLost; show the send-time error only if the
|
||||
|
||||
@@ -254,14 +254,32 @@ final class ChatViewModel {
|
||||
// MARK: - Send Message
|
||||
|
||||
/// Legacy text-only entry point. Forwards to the v0.12 overload with no
/// image attachments, preserving the v0.11 wire shape for old callers.
func sendText(_ text: String) {
    sendText(text, images: [])
}
|
||||
|
||||
/// v0.12+ overload: forward image attachments alongside the text.
/// Empty `images` keeps the legacy v0.11 wire shape; non-empty images
/// only flow when `HermesCapabilities.hasACPImagePrompts` is true (the
/// input bar gates the attachment UI on the same flag, so a non-empty
/// array reaching here means the agent already advertised support).
///
/// Terminal mode silently drops attachments — there's no way to pipe
/// binary content through the TTY — so a one-shot warning is surfaced.
func sendText(_ text: String, images: [ChatImageAttachment]) {
    if displayMode == .richChat {
        if let client = acpClient {
            // Exactly one send path — forward text + images together.
            // (An earlier revision also invoked the legacy text-only
            // call here, which would have double-sent the prompt.)
            sendViaACP(client: client, text: text, images: images)
        } else {
            // Auto-start ACP and send the queued message with its images.
            autoStartACPAndSend(text: text, images: images)
        }
    } else if let tv = terminalView {
        if !images.isEmpty {
            logger.warning("Terminal-mode chat dropped \(images.count) image attachment(s) — image input only works in ACP rich-chat mode")
            acpError = "Image attachments require ACP mode (rich chat)."
        }
        sendToTerminal(tv, text: text + "\r")
    }
}
|
||||
@@ -274,7 +292,7 @@ final class ChatViewModel {
|
||||
/// user never interacted with; those can be garbage-collected by Hermes
|
||||
/// between the DB read and ACP `session/load`, producing a silent prompt
|
||||
/// failure with no UI feedback.
|
||||
private func autoStartACPAndSend(text: String) {
|
||||
private func autoStartACPAndSend(text: String, images: [ChatImageAttachment] = []) {
|
||||
// Show the user message immediately
|
||||
richChatViewModel.addUserMessage(text: text)
|
||||
|
||||
@@ -313,7 +331,7 @@ final class ChatViewModel {
|
||||
acpStatus = "Connected (\(resolvedSessionId.prefix(12)))"
|
||||
|
||||
// Now send the queued prompt
|
||||
sendViaACP(client: client, text: text)
|
||||
sendViaACP(client: client, text: text, images: images)
|
||||
} catch {
|
||||
acpStatus = "Failed"
|
||||
await recordACPFailure(error, client: client, context: "Auto-start ACP failed")
|
||||
@@ -350,7 +368,7 @@ final class ChatViewModel {
|
||||
return ProjectSlashCommandService(context: context).expand(cmd, withArgument: argument)
|
||||
}
|
||||
|
||||
private func sendViaACP(client: ACPClient, text: String) {
|
||||
private func sendViaACP(client: ACPClient, text: String, images: [ChatImageAttachment] = []) {
|
||||
guard let sessionId = richChatViewModel.sessionId else {
|
||||
clearACPErrorState()
|
||||
acpError = "No session ID — cannot send"
|
||||
@@ -390,7 +408,7 @@ final class ChatViewModel {
|
||||
}
|
||||
acpPromptTask = Task { @MainActor in
|
||||
do {
|
||||
let result = try await client.sendPrompt(sessionId: sessionId, text: wireText)
|
||||
let result = try await client.sendPrompt(sessionId: sessionId, text: wireText, images: images)
|
||||
acpStatus = "Ready"
|
||||
richChatViewModel.handleACPEvent(
|
||||
.promptComplete(sessionId: sessionId, response: result)
|
||||
|
||||
@@ -9,7 +9,7 @@ import ScarfDesign
|
||||
struct ChatTranscriptPane: View {
|
||||
@Bindable var richChat: RichChatViewModel
|
||||
@Bindable var chatViewModel: ChatViewModel
|
||||
var onSend: (String) -> Void
|
||||
var onSend: (String, [ChatImageAttachment]) -> Void
|
||||
var isEnabled: Bool
|
||||
|
||||
var body: some View {
|
||||
|
||||
@@ -396,7 +396,7 @@ struct ChatView: View {
|
||||
if viewModel.hermesBinaryExists {
|
||||
RichChatView(
|
||||
richChat: viewModel.richChatViewModel,
|
||||
onSend: { viewModel.sendText($0) },
|
||||
onSend: { text, images in viewModel.sendText(text, images: images) },
|
||||
isEnabled: viewModel.hasActiveProcess || viewModel.hermesBinaryExists
|
||||
)
|
||||
} else {
|
||||
|
||||
@@ -1,20 +1,51 @@
|
||||
import SwiftUI
|
||||
import ScarfCore
|
||||
import ScarfDesign
|
||||
import UniformTypeIdentifiers
|
||||
import os
|
||||
#if canImport(AppKit)
|
||||
import AppKit
|
||||
#endif
|
||||
|
||||
struct RichChatInputBar: View {
|
||||
let onSend: (String) -> Void
|
||||
/// Send the user's text and any attached images. Empty `images`
|
||||
/// preserves the v0.11 wire shape; non-empty images are forwarded
|
||||
/// as ACP image content blocks (Hermes v0.12+; the composer hides
|
||||
/// the attachment UI on older hosts).
|
||||
let onSend: (String, [ChatImageAttachment]) -> Void
|
||||
let isEnabled: Bool
|
||||
var commands: [HermesSlashCommand] = []
|
||||
var showCompressButton: Bool = false
|
||||
|
||||
@Environment(\.hermesCapabilities) private var capabilitiesStore
|
||||
|
||||
@State private var text = ""
|
||||
@State private var showCompressSheet = false
|
||||
@State private var compressFocus = ""
|
||||
@State private var showMenu = false
|
||||
@State private var selectedIndex = 0
|
||||
@State private var attachments: [ChatImageAttachment] = []
|
||||
/// True while ImageEncoder is decoding/encoding pasted/dropped bytes.
|
||||
/// Renders a small spinner in the preview strip so the user knows
|
||||
/// their drop landed.
|
||||
@State private var isEncodingAttachment = false
|
||||
/// User-visible failure (decode failed, format unsupported). Auto-clears.
|
||||
@State private var attachmentError: String?
|
||||
@FocusState private var isFocused: Bool
|
||||
|
||||
/// Hard cap matches what Hermes' vision aux model swallows comfortably
|
||||
/// in one prompt. Going higher costs tokens without a quality gain.
|
||||
private static let maxAttachments = 5
|
||||
|
||||
private static let logger = Logger(subsystem: "com.scarf", category: "ChatComposer")
|
||||
|
||||
/// Whether the connected Hermes advertises ACP image prompts (v0.12+).
/// `capabilitiesStore` is nil until capability detection finishes
/// (~50ms locally, longer over SSH); during that window this resolves
/// to false, so the attachment UI stays hidden rather than flashing a
/// chip a v0.11 host couldn't honor.
private var supportsImagePrompts: Bool {
    capabilitiesStore?.capabilities.hasACPImagePrompts ?? false
}
|
||||
|
||||
var body: some View {
|
||||
VStack(alignment: .leading, spacing: 0) {
|
||||
if showMenu {
|
||||
@@ -36,6 +67,10 @@ struct RichChatInputBar: View {
|
||||
.padding(.top, 8)
|
||||
}
|
||||
|
||||
if !attachments.isEmpty || isEncodingAttachment || attachmentError != nil {
|
||||
attachmentStrip
|
||||
}
|
||||
|
||||
HStack(alignment: .bottom, spacing: ScarfSpace.s2) {
|
||||
if showCompressButton {
|
||||
Button {
|
||||
@@ -52,6 +87,10 @@ struct RichChatInputBar: View {
|
||||
.help("Compress conversation (/compress)")
|
||||
}
|
||||
|
||||
if supportsImagePrompts {
|
||||
attachmentButton
|
||||
}
|
||||
|
||||
TextEditor(text: $text)
|
||||
.font(ScarfFont.body)
|
||||
.scrollContentBackground(.hidden)
|
||||
@@ -70,7 +109,9 @@ struct RichChatInputBar: View {
|
||||
)
|
||||
.overlay(alignment: .topLeading) {
|
||||
if text.isEmpty {
|
||||
Text("Message Hermes… / for commands")
|
||||
Text(supportsImagePrompts
|
||||
? "Message Hermes… / for commands · drag images to attach"
|
||||
: "Message Hermes… / for commands")
|
||||
.scarfStyle(.body)
|
||||
.foregroundStyle(ScarfColor.foregroundFaint)
|
||||
.padding(.horizontal, 14)
|
||||
@@ -78,6 +119,25 @@ struct RichChatInputBar: View {
|
||||
.allowsHitTesting(false)
|
||||
}
|
||||
}
|
||||
// Drag-drop image attachments. Receives both file URLs
|
||||
// (from Finder) and raw image bitmap data (from
|
||||
// screenshot tools that drop tiff/png directly).
|
||||
// Capability-gated so v0.11 hosts don't surface a
|
||||
// drop target that does nothing.
|
||||
.onDrop(
|
||||
of: supportsImagePrompts ? [.image, .fileURL] : [],
|
||||
isTargeted: nil
|
||||
) { providers in
|
||||
guard supportsImagePrompts else { return false }
|
||||
ingestProviders(providers)
|
||||
return true
|
||||
}
|
||||
// Paste from screenshots / browser context menu.
|
||||
// Accepting `Data` keeps us off `NSImage` which would
|
||||
// require AppKit-typed paste. v0.12+ only.
|
||||
.onPasteCommand(of: pasteAcceptedTypes) { providers in
|
||||
ingestProviders(providers)
|
||||
}
|
||||
.onKeyPress(.upArrow, phases: .down) { _ in
|
||||
guard showMenu, !filteredCommands.isEmpty else { return .ignored }
|
||||
let n = filteredCommands.count
|
||||
@@ -148,6 +208,96 @@ struct RichChatInputBar: View {
|
||||
}
|
||||
}
|
||||
|
||||
/// Horizontal preview strip for attached images. Each chip shows the
|
||||
/// thumbnail (or a placeholder icon if we couldn't render one) plus
|
||||
/// an X to remove the attachment.
|
||||
@ViewBuilder
|
||||
private var attachmentStrip: some View {
|
||||
HStack(alignment: .center, spacing: ScarfSpace.s2) {
|
||||
if isEncodingAttachment {
|
||||
ProgressView()
|
||||
.controlSize(.small)
|
||||
Text("Encoding…")
|
||||
.scarfStyle(.caption)
|
||||
.foregroundStyle(ScarfColor.foregroundMuted)
|
||||
}
|
||||
ForEach(attachments) { attachment in
|
||||
attachmentChip(attachment)
|
||||
}
|
||||
if let err = attachmentError {
|
||||
Text(err)
|
||||
.scarfStyle(.caption)
|
||||
.foregroundStyle(ScarfColor.danger)
|
||||
}
|
||||
Spacer(minLength: 0)
|
||||
if !attachments.isEmpty {
|
||||
Text("\(attachments.count)/\(Self.maxAttachments)")
|
||||
.scarfStyle(.caption)
|
||||
.foregroundStyle(ScarfColor.foregroundFaint)
|
||||
}
|
||||
}
|
||||
.padding(.horizontal, ScarfSpace.s3)
|
||||
.padding(.top, ScarfSpace.s2)
|
||||
}
|
||||
|
||||
@ViewBuilder
|
||||
private func attachmentChip(_ attachment: ChatImageAttachment) -> some View {
|
||||
let thumb = chipThumbnail(for: attachment)
|
||||
HStack(spacing: 4) {
|
||||
thumb
|
||||
.frame(width: 32, height: 32)
|
||||
.clipShape(RoundedRectangle(cornerRadius: 4))
|
||||
Button {
|
||||
attachments.removeAll { $0.id == attachment.id }
|
||||
} label: {
|
||||
Image(systemName: "xmark.circle.fill")
|
||||
.font(.system(size: 14))
|
||||
.foregroundStyle(ScarfColor.foregroundMuted)
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
.help(attachment.filename ?? "Image attachment")
|
||||
}
|
||||
.padding(.horizontal, 6)
|
||||
.padding(.vertical, 4)
|
||||
.background(
|
||||
RoundedRectangle(cornerRadius: ScarfRadius.md)
|
||||
.fill(ScarfColor.backgroundTertiary)
|
||||
)
|
||||
}
|
||||
|
||||
/// Render the inline thumbnail for a chip. Falls back to a generic
|
||||
/// photo icon when the encoder didn't produce a thumbnail (e.g. the
|
||||
/// image was already small enough to skip the resize step).
|
||||
@ViewBuilder
|
||||
private func chipThumbnail(for attachment: ChatImageAttachment) -> some View {
|
||||
if let thumb = attachment.thumbnailBase64,
|
||||
let data = Data(base64Encoded: thumb),
|
||||
let image = NSImage(data: data) {
|
||||
Image(nsImage: image)
|
||||
.resizable()
|
||||
.aspectRatio(contentMode: .fill)
|
||||
} else {
|
||||
Image(systemName: "photo")
|
||||
.foregroundStyle(ScarfColor.foregroundMuted)
|
||||
.frame(maxWidth: .infinity, maxHeight: .infinity)
|
||||
.background(ScarfColor.backgroundSecondary)
|
||||
}
|
||||
}
|
||||
|
||||
private var attachmentButton: some View {
|
||||
Button {
|
||||
presentImagePicker()
|
||||
} label: {
|
||||
Image(systemName: "paperclip")
|
||||
.font(.system(size: 16))
|
||||
.foregroundStyle(ScarfColor.foregroundMuted)
|
||||
.padding(6)
|
||||
}
|
||||
.buttonStyle(.plain)
|
||||
.disabled(!isEnabled || attachments.count >= Self.maxAttachments)
|
||||
.help("Attach image (\(attachments.count)/\(Self.maxAttachments))")
|
||||
}
|
||||
|
||||
private var compressSheet: some View {
|
||||
VStack(alignment: .leading, spacing: ScarfSpace.s3) {
|
||||
Text("Compress Conversation")
|
||||
@@ -164,7 +314,7 @@ struct RichChatInputBar: View {
|
||||
Button("Compress") {
|
||||
let focus = compressFocus.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
let command = focus.isEmpty ? "/compress" : "/compress \(focus)"
|
||||
onSend(command)
|
||||
onSend(command, [])
|
||||
showCompressSheet = false
|
||||
}
|
||||
.buttonStyle(ScarfPrimaryButton())
|
||||
@@ -176,7 +326,18 @@ struct RichChatInputBar: View {
|
||||
}
|
||||
|
||||
private var canSend: Bool {
|
||||
isEnabled && !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
|
||||
guard isEnabled else { return false }
|
||||
// Allow sending image-only messages once at least one attachment
|
||||
// exists — vision models accept "describe this" with no text.
|
||||
if !attachments.isEmpty { return true }
|
||||
return !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
|
||||
}
|
||||
|
||||
/// MIME types accepted for paste. Restricting to image-bearing
|
||||
/// providers stops macOS from offering a paste menu when the user
|
||||
/// has plain text on the clipboard.
|
||||
private var pasteAcceptedTypes: [UTType] {
|
||||
supportsImagePrompts ? [.image, .png, .jpeg, .tiff, .heic] : []
|
||||
}
|
||||
|
||||
/// Show the slash menu only while the user is typing the command token:
|
||||
@@ -224,12 +385,116 @@ struct RichChatInputBar: View {
|
||||
|
||||
private func send() {
|
||||
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
|
||||
guard !trimmed.isEmpty, isEnabled else { return }
|
||||
onSend(trimmed)
|
||||
guard canSend else { return }
|
||||
onSend(trimmed, attachments)
|
||||
text = ""
|
||||
attachments.removeAll()
|
||||
showMenu = false
|
||||
selectedIndex = 0
|
||||
}
|
||||
|
||||
// MARK: - Attachment ingestion
|
||||
|
||||
/// Pull image bytes out of a set of `NSItemProvider`s (drag/drop or
|
||||
/// paste). Each provider may carry a file URL OR raw image data —
|
||||
/// we try both. Caps at `maxAttachments`; surplus drops are
|
||||
/// dropped silently with a status message.
|
||||
private func ingestProviders(_ providers: [NSItemProvider]) {
|
||||
let remainingSlots = Self.maxAttachments - attachments.count
|
||||
guard remainingSlots > 0 else {
|
||||
attachmentError = "Limit of \(Self.maxAttachments) images reached"
|
||||
scheduleAttachmentErrorClear()
|
||||
return
|
||||
}
|
||||
let toIngest = providers.prefix(remainingSlots)
|
||||
for provider in toIngest {
|
||||
ingestProvider(provider)
|
||||
}
|
||||
}
|
||||
|
||||
private func ingestProvider(_ provider: NSItemProvider) {
|
||||
// Prefer file URL when available — gives us the original filename
|
||||
// for the attachment chip's tooltip.
|
||||
if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) {
|
||||
isEncodingAttachment = true
|
||||
provider.loadObject(ofClass: URL.self) { url, _ in
|
||||
guard let url, let data = try? Data(contentsOf: url) else {
|
||||
Task { @MainActor in
|
||||
isEncodingAttachment = false
|
||||
attachmentError = "Couldn't read dropped file"
|
||||
scheduleAttachmentErrorClear()
|
||||
}
|
||||
return
|
||||
}
|
||||
encode(data: data, filename: url.lastPathComponent)
|
||||
}
|
||||
return
|
||||
}
|
||||
for typeId in [UTType.image.identifier, UTType.png.identifier, UTType.jpeg.identifier, UTType.tiff.identifier, UTType.heic.identifier] {
|
||||
if provider.hasItemConformingToTypeIdentifier(typeId) {
|
||||
isEncodingAttachment = true
|
||||
provider.loadDataRepresentation(forTypeIdentifier: typeId) { data, _ in
|
||||
guard let data else {
|
||||
Task { @MainActor in
|
||||
isEncodingAttachment = false
|
||||
attachmentError = "Couldn't decode pasted image"
|
||||
scheduleAttachmentErrorClear()
|
||||
}
|
||||
return
|
||||
}
|
||||
encode(data: data, filename: nil)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func encode(data: Data, filename: String?) {
|
||||
Task.detached(priority: .userInitiated) {
|
||||
do {
|
||||
let attachment = try ImageEncoder().encode(rawBytes: data, sourceFilename: filename)
|
||||
await MainActor.run {
|
||||
isEncodingAttachment = false
|
||||
attachments.append(attachment)
|
||||
}
|
||||
} catch {
|
||||
await MainActor.run {
|
||||
isEncodingAttachment = false
|
||||
attachmentError = (error as? LocalizedError)?.errorDescription ?? "Couldn't encode image"
|
||||
Self.logger.warning("ImageEncoder failed: \(error.localizedDescription, privacy: .public)")
|
||||
scheduleAttachmentErrorClear()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private func scheduleAttachmentErrorClear() {
|
||||
Task { @MainActor in
|
||||
try? await Task.sleep(nanoseconds: 4_000_000_000)
|
||||
attachmentError = nil
|
||||
}
|
||||
}
|
||||
|
||||
private func presentImagePicker() {
|
||||
#if canImport(AppKit)
|
||||
let panel = NSOpenPanel()
|
||||
panel.allowsMultipleSelection = true
|
||||
panel.canChooseDirectories = false
|
||||
panel.canChooseFiles = true
|
||||
panel.allowedContentTypes = [.image, .png, .jpeg, .tiff, .heic]
|
||||
panel.message = "Choose images to attach"
|
||||
panel.prompt = "Attach"
|
||||
let response = panel.runModal()
|
||||
guard response == .OK else { return }
|
||||
let urls = panel.urls
|
||||
let remainingSlots = Self.maxAttachments - attachments.count
|
||||
for url in urls.prefix(remainingSlots) {
|
||||
guard let data = try? Data(contentsOf: url) else { continue }
|
||||
isEncodingAttachment = true
|
||||
encode(data: data, filename: url.lastPathComponent)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
private extension Array {
|
||||
|
||||
@@ -17,7 +17,7 @@ import ScarfDesign
|
||||
/// can scroll horizontally inside the panes rather than losing them.
|
||||
struct RichChatView: View {
|
||||
@Bindable var richChat: RichChatViewModel
|
||||
var onSend: (String) -> Void
|
||||
var onSend: (String, [ChatImageAttachment]) -> Void
|
||||
var isEnabled: Bool
|
||||
@Environment(HermesFileWatcher.self) private var fileWatcher
|
||||
@Environment(ChatViewModel.self) private var chatViewModel
|
||||
|
||||
Reference in New Issue
Block a user