diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/ACP/ACPClient.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/ACP/ACPClient.swift index b611684..ba146d0 100644 --- a/scarf/Packages/ScarfCore/Sources/ScarfCore/ACP/ACPClient.swift +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/ACP/ACPClient.swift @@ -266,14 +266,47 @@ public actor ACPClient { // MARK: - Messaging public func sendPrompt(sessionId: String, text: String) async throws -> ACPPromptResult { + try await sendPrompt(sessionId: sessionId, text: text, images: []) + } + + /// v0.12+ overload: forward zero or more image attachments alongside + /// the user's text. Each attachment becomes a separate + /// `ImageContentBlock` in the ACP `prompt` content array — matches + /// the shape Hermes' `acp_adapter/server.py` expects (text first, + /// then image blocks). Hermes routes the resulting payload to a + /// vision-capable model automatically; the producer side only has + /// to deliver the bytes. + /// + /// Pre-v0.12 Hermes installs accepted only a single `text` block. + /// Callers gate this overload on + /// `HermesCapabilitiesStore.capabilities.hasACPImagePrompts` so we + /// don't send blocks an older agent would silently drop. + public func sendPrompt( + sessionId: String, + text: String, + images: [ChatImageAttachment] + ) async throws -> ACPPromptResult { statusMessage = "Sending prompt..." let messageId = UUID().uuidString + + // Always include the text block, even when empty — keeps the + // server-side text-extraction path stable regardless of whether + // the user sent text alongside the image(s). 
+ var promptBlocks: [[String: Any]] = [ + ["type": "text", "text": text] as [String: Any], + ] + for image in images { + promptBlocks.append([ + "type": "image", + "data": image.base64Data, + "mimeType": image.mimeType, + ] as [String: Any]) + } + let params: [String: AnyCodable] = [ "sessionId": AnyCodable(sessionId), "messageId": AnyCodable(messageId), - "prompt": AnyCodable([ - ["type": "text", "text": text] as [String: Any], - ] as [Any]), + "prompt": AnyCodable(promptBlocks as [Any]), ] let result = try await sendRequest(method: "session/prompt", params: params) let dict = result?.dictValue ?? [:] diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/ChatImageAttachment.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/ChatImageAttachment.swift new file mode 100644 index 0000000..6ad3ca9 --- /dev/null +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Models/ChatImageAttachment.swift @@ -0,0 +1,52 @@ +import Foundation + +/// One image attached to an outgoing chat prompt. +/// +/// Hermes v0.12 ACP advertises `prompt_capabilities.image = true` and +/// accepts content-block arrays in `session/prompt`. Scarf produces these +/// blocks from drag-dropped / pasted / picker-selected images. We +/// downsample + JPEG-encode at the producer side so the wire payload +/// stays under a few hundred kilobytes per image even when the user +/// drops a 12 MP screenshot. +/// +/// Constructed via `ImageEncoder.encode(...)`. The store-the-bytes-once +/// shape means `RichChatViewModel` can keep the array between turns +/// (e.g. while the agent is responding) without holding `NSImage` / +/// `UIImage` references that would pin the originals in memory. +public struct ChatImageAttachment: Sendable, Equatable, Identifiable { + public let id: String + /// IANA MIME type — matches the `mimeType` field on ACP `ImageContentBlock`. + /// Currently always `image/jpeg`: `ImageEncoder.encode` emits JPEG on every + /// path, so PNG or other source types are never kept. 
+ public let mimeType: String + /// Base64-encoded payload. NOT prefixed with `data:` — Hermes wraps it + /// when forwarding to OpenAI multimodal payloads (see + /// `_image_block_to_openai_part` in `acp_adapter/server.py`). + public let base64Data: String + /// Small inline thumbnail for the composer's preview strip. Same MIME + /// type as `base64Data`. Nil when thumbnail encoding failed or the + /// platform has no AppKit/UIKit encoder. + public let thumbnailBase64: String? + /// Original filename, when known (drag-drop carries it; paste doesn't). + /// Surfaced as a tooltip on the preview chip. + public let filename: String? + /// Approximate decoded byte count, kept for the composer's + /// "X images, Y KB" status pill. + public let approximateByteCount: Int + + public init( + id: String = UUID().uuidString, + mimeType: String, + base64Data: String, + thumbnailBase64: String?, + filename: String?, + approximateByteCount: Int + ) { + self.id = id + self.mimeType = mimeType + self.base64Data = base64Data + self.thumbnailBase64 = thumbnailBase64 + self.filename = filename + self.approximateByteCount = approximateByteCount + } +} diff --git a/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/ImageEncoder.swift b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/ImageEncoder.swift new file mode 100644 index 0000000..d2e1d57 --- /dev/null +++ b/scarf/Packages/ScarfCore/Sources/ScarfCore/Services/ImageEncoder.swift @@ -0,0 +1,162 @@ +import Foundation +#if canImport(AppKit) +import AppKit +#endif +#if canImport(UIKit) +import UIKit +#endif +#if canImport(CoreImage) +import CoreImage +#endif + +/// Downsamples + base64-encodes user-supplied images for ACP transport. +/// +/// **Why downsample on the producer side.** Hermes happily forwards the +/// bytes to a vision model, but a 12 MP screenshot at 4 MB is wasteful +/// — it eats 5–6× more tokens than a 1024×1024 thumbnail and gives the +/// model no extra signal. 
Cap the long edge at 1568 px (Anthropic's +/// recommended max for Claude vision) and drop quality to JPEG 0.85, +/// which keeps screenshot text crisp while landing under ~300 KB per +/// image. The 5-image-per-message limit (chosen on the producer side) +/// keeps the total prompt payload below ~2 MB. +/// +/// **Why detached.** Image loading + downsampling is CPU-bound. Run only +/// from a `Task.detached` context (the encoder type is `Sendable` and +/// every method is `nonisolated`). The companion `ChatImageAttachment` +/// is a Sendable value type so the result hops back to MainActor cleanly. +public struct ImageEncoder: Sendable { + /// Long-edge pixel cap. 1568 is Anthropic's recommended ceiling for + /// Claude vision input — past it, the provider downsamples server-side + /// and we just paid for the extra bytes. Tweak only with vision-model + /// guidance from Hermes side. + public static let maxLongEdge: CGFloat = 1568 + /// JPEG quality factor. 0.85 is the inflection point above which + /// file size jumps quickly without obvious visual gain on screenshots + /// or photographs. + public static let jpegQuality: CGFloat = 0.85 + /// Long-edge cap for the inline thumbnail rendered in the composer + /// chip. Kept under the system thumbnail size so `Image(data:)` + /// renders without extra resampling. + public static let thumbnailLongEdge: CGFloat = 256 + + public init() {} + + public enum EncoderError: Error, LocalizedError { + case unsupportedFormat + case decodeFailed + case encodeFailed + case empty + + public var errorDescription: String? { + switch self { + case .unsupportedFormat: return "Image format not recognized" + case .decodeFailed: return "Couldn't decode image data" + case .encodeFailed: return "Couldn't encode image as JPEG" + case .empty: return "Image data was empty" + } + } + } + + /// Encode raw bytes (from a paste/drop/picker) into a wire-ready + /// attachment. Detached-only — never call from MainActor. 
The + /// originating bytes are not retained beyond this call. + public nonisolated func encode( + rawBytes: Data, + sourceFilename: String? = nil + ) throws -> ChatImageAttachment { + guard !rawBytes.isEmpty else { throw EncoderError.empty } + + #if canImport(AppKit) + guard let nsImage = NSImage(data: rawBytes) else { throw EncoderError.decodeFailed } + let targetSize = Self.fittedSize(for: nsImage.size, maxLongEdge: Self.maxLongEdge) + let mainData = try Self.jpegBytes(from: nsImage, size: targetSize) + let thumbSize = Self.fittedSize(for: nsImage.size, maxLongEdge: Self.thumbnailLongEdge) + let thumbData = try? Self.jpegBytes(from: nsImage, size: thumbSize) + return ChatImageAttachment( + mimeType: "image/jpeg", + base64Data: mainData.base64EncodedString(), + thumbnailBase64: thumbData?.base64EncodedString(), + filename: sourceFilename, + approximateByteCount: mainData.count + ) + + #elseif canImport(UIKit) + guard let uiImage = UIImage(data: rawBytes) else { throw EncoderError.decodeFailed } + let targetSize = Self.fittedSize(for: uiImage.size, maxLongEdge: Self.maxLongEdge) + let mainData = try Self.jpegBytes(from: uiImage, size: targetSize) + let thumbSize = Self.fittedSize(for: uiImage.size, maxLongEdge: Self.thumbnailLongEdge) + let thumbData = try? Self.jpegBytes(from: uiImage, size: thumbSize) + return ChatImageAttachment( + mimeType: "image/jpeg", + base64Data: mainData.base64EncodedString(), + thumbnailBase64: thumbData?.base64EncodedString(), + filename: sourceFilename, + approximateByteCount: mainData.count + ) + + #else + // Linux CI / unknown platforms: pass through raw bytes if the + // input already looks like a JPEG, else refuse. Keeps the + // package compiling without a hard AppKit/UIKit dep. 
+ if rawBytes.starts(with: [0xFF, 0xD8]) { + return ChatImageAttachment( + mimeType: "image/jpeg", + base64Data: rawBytes.base64EncodedString(), + thumbnailBase64: nil, + filename: sourceFilename, + approximateByteCount: rawBytes.count + ) + } + throw EncoderError.unsupportedFormat + #endif + } + + nonisolated private static func fittedSize(for source: CGSize, maxLongEdge: CGFloat) -> CGSize { + let longest = max(source.width, source.height) + if longest <= maxLongEdge { return source } + let scale = maxLongEdge / longest + return CGSize( + width: floor(source.width * scale), + height: floor(source.height * scale) + ) + } + + #if canImport(AppKit) + nonisolated private static func jpegBytes(from image: NSImage, size: CGSize) throws -> Data { + let resized = NSImage(size: size) + resized.lockFocus() + NSGraphicsContext.current?.imageInterpolation = .high + image.draw( + in: CGRect(origin: .zero, size: size), + from: .zero, + operation: .copy, + fraction: 1.0 + ) + resized.unlockFocus() + guard let tiff = resized.tiffRepresentation, + let rep = NSBitmapImageRep(data: tiff), + let data = rep.representation( + using: .jpeg, + properties: [.compressionFactor: jpegQuality] + ) + else { + throw EncoderError.encodeFailed + } + return data + } + #elseif canImport(UIKit) + nonisolated private static func jpegBytes(from image: UIImage, size: CGSize) throws -> Data { + let format = UIGraphicsImageRendererFormat() + format.scale = 1 + format.opaque = true + let renderer = UIGraphicsImageRenderer(size: size, format: format) + let resized = renderer.image { _ in + image.draw(in: CGRect(origin: .zero, size: size)) + } + guard let data = resized.jpegData(compressionQuality: jpegQuality) else { + throw EncoderError.encodeFailed + } + return data + } + #endif +} diff --git a/scarf/Scarf iOS/Chat/ChatView.swift b/scarf/Scarf iOS/Chat/ChatView.swift index d9ae223..f34e97b 100644 --- a/scarf/Scarf iOS/Chat/ChatView.swift +++ b/scarf/Scarf iOS/Chat/ChatView.swift @@ -3,6 +3,9 @@ import 
ScarfCore import ScarfIOS import ScarfDesign import os +#if canImport(PhotosUI) +import PhotosUI +#endif // The Chat feature on iOS is gated on `canImport(SQLite3)` because // `RichChatViewModel` reads session history from `HermesDataService` @@ -24,9 +27,23 @@ struct ChatView: View { @Environment(\.scarfGoCoordinator) private var coordinator @Environment(\.serverContext) private var envContext + @Environment(\.hermesCapabilities) private var capabilitiesStore @State private var controller: ChatController @State private var showProjectPicker = false @State private var showSlashCommandsSheet = false + /// PhotosPicker selection. Bridge between SwiftUI's selection + /// binding and our `ChatImageAttachment` payload — `loadTransferable` + /// produces raw `Data` we then hand to `ImageEncoder`. v0.12+ only. + @State private var pickerSelection: [PhotosPickerItem] = [] + @State private var showPhotoPicker = false + @State private var isEncodingAttachment = false + @State private var attachmentError: String? + + private static let maxAttachments = 5 + + private var supportsImagePrompts: Bool { + capabilitiesStore?.capabilities.hasACPImagePrompts ?? false + } /// Drives the composer's keyboard. Bound to the TextField via /// `.focused(...)`; cleared by the scroll-to-dismiss gesture on /// the message list AND by an explicit keyboard-toolbar button. 
@@ -431,7 +448,108 @@ struct ChatView: View { } private var composer: some View { + VStack(alignment: .leading, spacing: 4) { + if !controller.attachments.isEmpty || isEncodingAttachment || attachmentError != nil { + attachmentStrip + } + composerRow + } + .padding(.horizontal, 12) + .padding(.vertical, 8) + .background(.regularMaterial) + #if canImport(PhotosUI) + .photosPicker( + isPresented: $showPhotoPicker, + selection: $pickerSelection, + maxSelectionCount: max(0, Self.maxAttachments - controller.attachments.count), + matching: .images + ) + .onChange(of: pickerSelection) { _, items in + ingestPickerItems(items) + } + #endif + } + + @ViewBuilder + private var attachmentStrip: some View { + HStack(alignment: .center, spacing: 8) { + if isEncodingAttachment { + ProgressView().controlSize(.small) + Text("Encoding…") + .font(.caption) + .foregroundStyle(.secondary) + } + ForEach(controller.attachments) { attachment in + attachmentChip(attachment) + } + if let err = attachmentError { + Text(err) + .font(.caption) + .foregroundStyle(ScarfColor.danger) + } + Spacer(minLength: 0) + if !controller.attachments.isEmpty { + Text("\(controller.attachments.count)/\(Self.maxAttachments)") + .font(.caption2) + .foregroundStyle(.tertiary) + } + } + } + + @ViewBuilder + private func attachmentChip(_ attachment: ChatImageAttachment) -> some View { + HStack(spacing: 4) { + attachmentChipThumbnail(attachment) + .frame(width: 32, height: 32) + .clipShape(RoundedRectangle(cornerRadius: 4)) + Button { + controller.attachments.removeAll { $0.id == attachment.id } + } label: { + Image(systemName: "xmark.circle.fill") + .foregroundStyle(.secondary) + } + .buttonStyle(.plain) + .accessibilityLabel("Remove attached image") + } + .padding(.horizontal, 6) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 8) + .fill(ScarfColor.backgroundSecondary) + ) + } + + @ViewBuilder + private func attachmentChipThumbnail(_ attachment: ChatImageAttachment) -> some View { + if let 
thumb = attachment.thumbnailBase64, + let data = Data(base64Encoded: thumb), + let image = UIImage(data: data) { + Image(uiImage: image) + .resizable() + .aspectRatio(contentMode: .fill) + } else { + Image(systemName: "photo") + .foregroundStyle(.secondary) + .frame(maxWidth: .infinity, maxHeight: .infinity) + .background(ScarfColor.backgroundSecondary) + } + } + + private var composerRow: some View { HStack(alignment: .bottom, spacing: 8) { + if supportsImagePrompts { + Button { + showPhotoPicker = true + } label: { + Image(systemName: "paperclip") + .font(.system(size: 22)) + .foregroundStyle(.secondary) + .padding(.bottom, 4) + } + .buttonStyle(.plain) + .disabled(controller.state != .ready || controller.attachments.count >= Self.maxAttachments) + .accessibilityLabel("Attach image") + } TextField( "Message…", text: $controller.draft, @@ -480,13 +598,58 @@ struct ChatView: View { Image(systemName: "arrow.up.circle.fill") .font(.system(size: 28)) } - .disabled(controller.state != .ready || controller.draft.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty) + .disabled(!canSendComposer) } - .padding(.horizontal, 12) - .padding(.vertical, 8) - .background(.regularMaterial) } + /// Send is enabled when ready AND we have either text or at least + /// one attachment. Image-only sends are valid for vision models. + private var canSendComposer: Bool { + guard controller.state == .ready else { return false } + if !controller.attachments.isEmpty { return true } + return !controller.draft.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + + /// Pull JPEG/PNG bytes out of each PhotosPickerItem and feed them + /// through ImageEncoder. Detached so the heavyweight resize + + /// JPEG-encode work doesn't block MainActor; the resulting + /// attachment hops back to MainActor for state mutation. + /// + /// PhotosPickerItem can deliver `Data` directly via the + /// `Transferable` API. 
After ingestion the binding is reset so a + /// follow-up pick triggers `onChange` again. + #if canImport(PhotosUI) + private func ingestPickerItems(_ items: [PhotosPickerItem]) { + guard !items.isEmpty else { return } + // Capture the items, immediately clear the binding so a future + // pick triggers onChange even when the user re-selects the + // same image set. PhotosPicker behavior: identical selection + // doesn't re-fire onChange unless the binding flips through nil. + let snapshot = items + pickerSelection = [] + isEncodingAttachment = true + Task { @MainActor in + for item in snapshot { + guard controller.attachments.count < Self.maxAttachments else { break } + do { + guard let data = try await item.loadTransferable(type: Data.self) else { continue } + let attachment = try await Task.detached(priority: .userInitiated) { + try ImageEncoder().encode(rawBytes: data, sourceFilename: nil) + }.value + controller.attachments.append(attachment) + } catch { + attachmentError = (error as? LocalizedError)?.errorDescription ?? "Couldn't encode image" + Task { @MainActor in + try? await Task.sleep(nanoseconds: 4_000_000_000) + attachmentError = nil + } + } + } + isEncodingAttachment = false + } + } + #endif + @State private var showErrorDetails: Bool = false /// Inline error banner rendered above the message list when the @@ -696,6 +859,12 @@ final class ChatController { var vm: RichChatViewModel var draft: String = "" + /// v0.12+ image attachments queued to send with the next prompt. + /// Capped at 5 by the composer UI; the cap matches the Mac behavior + /// and keeps total ACP prompt payload under ~2 MB even on a slow + /// cellular link. Cleared after each successful `send()`. + var attachments: [ChatImageAttachment] = [] + /// Set when chat-start is blocked because the active server's /// `config.yaml` has no `model.default` / `model.provider`. 
ChatView /// observes this to present an inline "pick a model" sheet — the @@ -1003,12 +1172,22 @@ final class ChatController { func send() async { guard state == .ready, let client else { return } let text = draft.trimmingCharacters(in: .whitespacesAndNewlines) - guard !text.isEmpty else { return } + // v0.12+ allows image-only sends — vision models accept "describe + // this" with no text. Bail only when both fields are empty. + guard !text.isEmpty || !attachments.isEmpty else { return } let sessionId = vm.sessionId ?? "" guard !sessionId.isEmpty else { return } + let images = attachments + attachments = [] draft = "" clearStoredDraft() - vm.addUserMessage(text: text) + if !text.isEmpty { + vm.addUserMessage(text: text) + } else { + // Surface an image-only message so the user sees their bubble + // even when they didn't type any caption. + vm.addUserMessage(text: "[image attached]") + } // /steer is non-interruptive — the agent is still on its // current turn; the guidance applies after the next tool call. // Surface a transient toast confirming the guidance was @@ -1029,7 +1208,7 @@ final class ChatController { // literally. v2.5. let wireText = expandIfProjectScoped(text) do { - _ = try await client.sendPrompt(sessionId: sessionId, text: wireText) + _ = try await client.sendPrompt(sessionId: sessionId, text: wireText, images: images) } catch { // The event task may already have surfaced a // .connectionLost; show the send-time error only if the diff --git a/scarf/scarf/Features/Chat/ViewModels/ChatViewModel.swift b/scarf/scarf/Features/Chat/ViewModels/ChatViewModel.swift index 287eb2b..854f2c2 100644 --- a/scarf/scarf/Features/Chat/ViewModels/ChatViewModel.swift +++ b/scarf/scarf/Features/Chat/ViewModels/ChatViewModel.swift @@ -254,14 +254,32 @@ final class ChatViewModel { // MARK: - Send Message func sendText(_ text: String) { + sendText(text, images: []) + } + + /// v0.12+ overload: forward image attachments alongside the text. 
+ /// Empty `images` keeps the legacy v0.11 wire shape; non-empty images + /// only flow when `HermesCapabilities.hasACPImagePrompts` is true + /// (the input bar gates the attachment UI on the same flag, so a + /// non-empty array reaching here means we've already verified the + /// agent supports it). + /// + /// Terminal mode silently drops attachments — there's no way to + /// pipe binary content through the TTY. Surface a one-shot warning + /// so the user knows. + func sendText(_ text: String, images: [ChatImageAttachment]) { if displayMode == .richChat { if let client = acpClient { - sendViaACP(client: client, text: text) + sendViaACP(client: client, text: text, images: images) } else { // Auto-start ACP and send the queued message - autoStartACPAndSend(text: text) + autoStartACPAndSend(text: text, images: images) } } else if let tv = terminalView { + if !images.isEmpty { + logger.warning("Terminal-mode chat dropped \(images.count) image attachment(s) — image input only works in ACP rich-chat mode") + acpError = "Image attachments require ACP mode (rich chat)." + } sendToTerminal(tv, text: text + "\r") } } @@ -274,7 +292,7 @@ final class ChatViewModel { /// user never interacted with; those can be garbage-collected by Hermes /// between the DB read and ACP `session/load`, producing a silent prompt /// failure with no UI feedback. 
- private func autoStartACPAndSend(text: String) { + private func autoStartACPAndSend(text: String, images: [ChatImageAttachment] = []) { // Show the user message immediately richChatViewModel.addUserMessage(text: text) @@ -313,7 +331,7 @@ final class ChatViewModel { acpStatus = "Connected (\(resolvedSessionId.prefix(12)))" // Now send the queued prompt - sendViaACP(client: client, text: text) + sendViaACP(client: client, text: text, images: images) } catch { acpStatus = "Failed" await recordACPFailure(error, client: client, context: "Auto-start ACP failed") @@ -350,7 +368,7 @@ final class ChatViewModel { return ProjectSlashCommandService(context: context).expand(cmd, withArgument: argument) } - private func sendViaACP(client: ACPClient, text: String) { + private func sendViaACP(client: ACPClient, text: String, images: [ChatImageAttachment] = []) { guard let sessionId = richChatViewModel.sessionId else { clearACPErrorState() acpError = "No session ID — cannot send" @@ -390,7 +408,7 @@ final class ChatViewModel { } acpPromptTask = Task { @MainActor in do { - let result = try await client.sendPrompt(sessionId: sessionId, text: wireText) + let result = try await client.sendPrompt(sessionId: sessionId, text: wireText, images: images) acpStatus = "Ready" richChatViewModel.handleACPEvent( .promptComplete(sessionId: sessionId, response: result) diff --git a/scarf/scarf/Features/Chat/Views/ChatTranscriptPane.swift b/scarf/scarf/Features/Chat/Views/ChatTranscriptPane.swift index 272a566..f3e9a0f 100644 --- a/scarf/scarf/Features/Chat/Views/ChatTranscriptPane.swift +++ b/scarf/scarf/Features/Chat/Views/ChatTranscriptPane.swift @@ -9,7 +9,7 @@ import ScarfDesign struct ChatTranscriptPane: View { @Bindable var richChat: RichChatViewModel @Bindable var chatViewModel: ChatViewModel - var onSend: (String) -> Void + var onSend: (String, [ChatImageAttachment]) -> Void var isEnabled: Bool var body: some View { diff --git a/scarf/scarf/Features/Chat/Views/ChatView.swift 
b/scarf/scarf/Features/Chat/Views/ChatView.swift index 2f6b0f0..e6ca780 100644 --- a/scarf/scarf/Features/Chat/Views/ChatView.swift +++ b/scarf/scarf/Features/Chat/Views/ChatView.swift @@ -396,7 +396,7 @@ struct ChatView: View { if viewModel.hermesBinaryExists { RichChatView( richChat: viewModel.richChatViewModel, - onSend: { viewModel.sendText($0) }, + onSend: { text, images in viewModel.sendText(text, images: images) }, isEnabled: viewModel.hasActiveProcess || viewModel.hermesBinaryExists ) } else { diff --git a/scarf/scarf/Features/Chat/Views/RichChatInputBar.swift b/scarf/scarf/Features/Chat/Views/RichChatInputBar.swift index 26ee7be..7cc32c4 100644 --- a/scarf/scarf/Features/Chat/Views/RichChatInputBar.swift +++ b/scarf/scarf/Features/Chat/Views/RichChatInputBar.swift @@ -1,20 +1,51 @@ import SwiftUI import ScarfCore import ScarfDesign +import UniformTypeIdentifiers +import os +#if canImport(AppKit) +import AppKit +#endif struct RichChatInputBar: View { - let onSend: (String) -> Void + /// Send the user's text and any attached images. Empty `images` + /// preserves the v0.11 wire shape; non-empty images are forwarded + /// as ACP image content blocks (Hermes v0.12+; the composer hides + /// the attachment UI on older hosts). + let onSend: (String, [ChatImageAttachment]) -> Void let isEnabled: Bool var commands: [HermesSlashCommand] = [] var showCompressButton: Bool = false + @Environment(\.hermesCapabilities) private var capabilitiesStore + @State private var text = "" @State private var showCompressSheet = false @State private var compressFocus = "" @State private var showMenu = false @State private var selectedIndex = 0 + @State private var attachments: [ChatImageAttachment] = [] + /// True while ImageEncoder is decoding/encoding pasted/dropped bytes. + /// Renders a small spinner in the preview strip so the user knows + /// their drop landed. + @State private var isEncodingAttachment = false + /// User-visible failure (decode failed, format unsupported). 
Auto-clears. + @State private var attachmentError: String? @FocusState private var isFocused: Bool + /// Hard cap matches what Hermes' vision aux model swallows comfortably + /// in one prompt. Going higher costs tokens without a quality gain. + private static let maxAttachments = 5 + + private static let logger = Logger(subsystem: "com.scarf", category: "ChatComposer") + + /// `nil` until detection finishes — we hide the attachment UI in + /// that brief window (~50ms locally, longer over SSH) so we never + /// flash an attachment chip a v0.11 host couldn't honor. + private var supportsImagePrompts: Bool { + capabilitiesStore?.capabilities.hasACPImagePrompts ?? false + } + var body: some View { VStack(alignment: .leading, spacing: 0) { if showMenu { @@ -36,6 +67,10 @@ struct RichChatInputBar: View { .padding(.top, 8) } + if !attachments.isEmpty || isEncodingAttachment || attachmentError != nil { + attachmentStrip + } + HStack(alignment: .bottom, spacing: ScarfSpace.s2) { if showCompressButton { Button { @@ -52,6 +87,10 @@ struct RichChatInputBar: View { .help("Compress conversation (/compress)") } + if supportsImagePrompts { + attachmentButton + } + TextEditor(text: $text) .font(ScarfFont.body) .scrollContentBackground(.hidden) @@ -70,7 +109,9 @@ struct RichChatInputBar: View { ) .overlay(alignment: .topLeading) { if text.isEmpty { - Text("Message Hermes… / for commands") + Text(supportsImagePrompts + ? "Message Hermes… / for commands · drag images to attach" + : "Message Hermes… / for commands") .scarfStyle(.body) .foregroundStyle(ScarfColor.foregroundFaint) .padding(.horizontal, 14) @@ -78,6 +119,25 @@ struct RichChatInputBar: View { .allowsHitTesting(false) } } + // Drag-drop image attachments. Receives both file URLs + // (from Finder) and raw image bitmap data (from + // screenshot tools that drop tiff/png directly). + // Capability-gated so v0.11 hosts don't surface a + // drop target that does nothing. + .onDrop( + of: supportsImagePrompts ? 
[.image, .fileURL] : [], + isTargeted: nil + ) { providers in + guard supportsImagePrompts else { return false } + ingestProviders(providers) + return true + } + // Paste from screenshots / browser context menu. + // Accepting `Data` keeps us off `NSImage` which would + // require AppKit-typed paste. v0.12+ only. + .onPasteCommand(of: pasteAcceptedTypes) { providers in + ingestProviders(providers) + } .onKeyPress(.upArrow, phases: .down) { _ in guard showMenu, !filteredCommands.isEmpty else { return .ignored } let n = filteredCommands.count @@ -148,6 +208,96 @@ struct RichChatInputBar: View { } } + /// Horizontal preview strip for attached images. Each chip shows the + /// thumbnail (or a placeholder icon if we couldn't render one) plus + /// an X to remove the attachment. + @ViewBuilder + private var attachmentStrip: some View { + HStack(alignment: .center, spacing: ScarfSpace.s2) { + if isEncodingAttachment { + ProgressView() + .controlSize(.small) + Text("Encoding…") + .scarfStyle(.caption) + .foregroundStyle(ScarfColor.foregroundMuted) + } + ForEach(attachments) { attachment in + attachmentChip(attachment) + } + if let err = attachmentError { + Text(err) + .scarfStyle(.caption) + .foregroundStyle(ScarfColor.danger) + } + Spacer(minLength: 0) + if !attachments.isEmpty { + Text("\(attachments.count)/\(Self.maxAttachments)") + .scarfStyle(.caption) + .foregroundStyle(ScarfColor.foregroundFaint) + } + } + .padding(.horizontal, ScarfSpace.s3) + .padding(.top, ScarfSpace.s2) + } + + @ViewBuilder + private func attachmentChip(_ attachment: ChatImageAttachment) -> some View { + let thumb = chipThumbnail(for: attachment) + HStack(spacing: 4) { + thumb + .frame(width: 32, height: 32) + .clipShape(RoundedRectangle(cornerRadius: 4)) + Button { + attachments.removeAll { $0.id == attachment.id } + } label: { + Image(systemName: "xmark.circle.fill") + .font(.system(size: 14)) + .foregroundStyle(ScarfColor.foregroundMuted) + } + .buttonStyle(.plain) + .help(attachment.filename 
?? "Image attachment") + } + .padding(.horizontal, 6) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: ScarfRadius.md) + .fill(ScarfColor.backgroundTertiary) + ) + } + + /// Render the inline thumbnail for a chip. Falls back to a generic + /// photo icon when the attachment has no thumbnail or its base64 + /// payload can't be decoded into an `NSImage`. + @ViewBuilder + private func chipThumbnail(for attachment: ChatImageAttachment) -> some View { + if let thumb = attachment.thumbnailBase64, + let data = Data(base64Encoded: thumb), + let image = NSImage(data: data) { + Image(nsImage: image) + .resizable() + .aspectRatio(contentMode: .fill) + } else { + Image(systemName: "photo") + .foregroundStyle(ScarfColor.foregroundMuted) + .frame(maxWidth: .infinity, maxHeight: .infinity) + .background(ScarfColor.backgroundSecondary) + } + } + + private var attachmentButton: some View { + Button { + presentImagePicker() + } label: { + Image(systemName: "paperclip") + .font(.system(size: 16)) + .foregroundStyle(ScarfColor.foregroundMuted) + .padding(6) + } + .buttonStyle(.plain) + .disabled(!isEnabled || attachments.count >= Self.maxAttachments) + .help("Attach image (\(attachments.count)/\(Self.maxAttachments))") + } + private var compressSheet: some View { VStack(alignment: .leading, spacing: ScarfSpace.s3) { Text("Compress Conversation") @@ -164,7 +314,7 @@ struct RichChatInputBar: View { Button("Compress") { let focus = compressFocus.trimmingCharacters(in: .whitespacesAndNewlines) let command = focus.isEmpty ? 
"/compress" : "/compress \(focus)" - onSend(command) + onSend(command, []) showCompressSheet = false } .buttonStyle(ScarfPrimaryButton()) @@ -176,7 +326,18 @@ struct RichChatInputBar: View { } private var canSend: Bool { - isEnabled && !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + guard isEnabled else { return false } + // Allow sending image-only messages once at least one attachment + // exists — vision models accept "describe this" with no text. + if !attachments.isEmpty { return true } + return !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + } + + /// MIME types accepted for paste. Restricting to image-bearing + /// providers stops macOS from offering a paste menu when the user + /// has plain text on the clipboard. + private var pasteAcceptedTypes: [UTType] { + supportsImagePrompts ? [.image, .png, .jpeg, .tiff, .heic] : [] } /// Show the slash menu only while the user is typing the command token: @@ -224,12 +385,116 @@ struct RichChatInputBar: View { private func send() { let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - guard !trimmed.isEmpty, isEnabled else { return } - onSend(trimmed) + guard canSend else { return } + onSend(trimmed, attachments) text = "" + attachments.removeAll() showMenu = false selectedIndex = 0 } + + // MARK: - Attachment ingestion + + /// Pull image bytes out of a set of `NSItemProvider`s (drag/drop or + /// paste). Each provider may carry a file URL OR raw image data — + /// we try both. Caps at `maxAttachments`: a limit message is shown when + /// no slots remain; otherwise surplus providers are skipped without one. 
+    private func ingestProviders(_ providers: [NSItemProvider]) {
+        let remainingSlots = Self.maxAttachments - attachments.count
+        guard remainingSlots > 0 else {
+            reportAttachmentLimit()
+            return
+        }
+        // Providers beyond the free slots are skipped; surface that to the
+        // user rather than discarding them without any feedback.
+        if providers.count > remainingSlots {
+            reportAttachmentLimit()
+        }
+        for provider in providers.prefix(remainingSlots) {
+            ingestProvider(provider)
+        }
+    }
+
+    /// Show the "limit reached" status message and schedule its removal.
+    private func reportAttachmentLimit() {
+        attachmentError = "Limit of \(Self.maxAttachments) images reached"
+        scheduleAttachmentErrorClear()
+    }
+
+    /// Load the image bytes carried by one provider and pass them to
+    /// `encode(data:filename:)`.
+    ///
+    /// A provider advertising a file URL is read from disk (the filename is
+    /// forwarded); otherwise the first image type identifier it conforms to
+    /// is loaded as raw data with no filename. A provider matching neither
+    /// is ignored. Load failures set `attachmentError` on the main actor.
+    private func ingestProvider(_ provider: NSItemProvider) {
+        // Prefer file URL when available — gives us the original filename
+        // for the attachment chip's tooltip.
+        if provider.hasItemConformingToTypeIdentifier(UTType.fileURL.identifier) {
+            isEncodingAttachment = true
+            provider.loadObject(ofClass: URL.self) { url, _ in
+                guard let url, let data = try? Data(contentsOf: url) else {
+                    Task { @MainActor in
+                        isEncodingAttachment = false
+                        attachmentError = "Couldn't read dropped file"
+                        scheduleAttachmentErrorClear()
+                    }
+                    return
+                }
+                encode(data: data, filename: url.lastPathComponent)
+            }
+            return
+        }
+
+        // Same candidate order as before: the generic image type first,
+        // then the concrete formats.
+        let imageTypeIds = [UTType.image, .png, .jpeg, .tiff, .heic].map(\.identifier)
+        guard let typeId = imageTypeIds.first(where: { provider.hasItemConformingToTypeIdentifier($0) }) else {
+            return
+        }
+        isEncodingAttachment = true
+        provider.loadDataRepresentation(forTypeIdentifier: typeId) { data, _ in
+            guard let data else {
+                Task { @MainActor in
+                    isEncodingAttachment = false
+                    attachmentError = "Couldn't decode pasted image"
+                    scheduleAttachmentErrorClear()
+                }
+                return
+            }
+            encode(data: data, filename: nil)
+        }
+    }
+
+    /// Encode raw image bytes off the main actor and append the resulting
+    /// attachment on the main actor.
+    ///
+    /// - Parameters:
+    ///   - data: Raw bytes handed to `ImageEncoder().encode(rawBytes:sourceFilename:)`.
+    ///   - filename: Source filename forwarded to the encoder, or `nil`.
+    ///
+    /// Encoder failures are logged and shown through `attachmentError`.
+    // NOTE(review): `isEncodingAttachment` is a single flag, so the first
+    // encode to finish clears it while other encodes may still be running —
+    // consider an in-flight counter.
+    private func encode(data: Data, filename: String?) {
+        Task.detached(priority: .userInitiated) {
+            do {
+                let attachment = try ImageEncoder().encode(rawBytes: data, sourceFilename: filename)
+                await MainActor.run {
+                    isEncodingAttachment = false
+                    // The cap was checked when ingestion started, but encodes
+                    // finish asynchronously; re-check here so overlapping
+                    // drops/pastes/picker runs can't exceed `maxAttachments`.
+                    guard attachments.count < Self.maxAttachments else {
+                        reportAttachmentLimit()
+                        return
+                    }
+                    attachments.append(attachment)
+                }
+            } catch {
+                await MainActor.run {
+                    isEncodingAttachment = false
+                    attachmentError = (error as? LocalizedError)?.errorDescription ?? "Couldn't encode image"
+                    Self.logger.warning("ImageEncoder failed: \(error.localizedDescription, privacy: .public)")
+                    scheduleAttachmentErrorClear()
+                }
+            }
+        }
+    }
+
+    /// Clear the message currently in `attachmentError` after four seconds.
+    ///
+    /// The message is captured at call time and only cleared if it is still
+    /// the one on screen, so a timer started for an earlier error cannot
+    /// wipe a newer one before its own four seconds are up.
+    private func scheduleAttachmentErrorClear() {
+        let shownMessage = attachmentError
+        Task { @MainActor in
+            try? await Task.sleep(nanoseconds: 4_000_000_000)
+            if attachmentError == shownMessage {
+                attachmentError = nil
+            }
+        }
+    }
+
+    /// Run a modal `NSOpenPanel` for images and encode each chosen file,
+    /// up to the number of free attachment slots. Compiles to a no-op when
+    /// AppKit is unavailable.
+    private func presentImagePicker() {
+        #if canImport(AppKit)
+        let panel = NSOpenPanel()
+        panel.allowsMultipleSelection = true
+        panel.canChooseDirectories = false
+        panel.canChooseFiles = true
+        panel.allowedContentTypes = [.image, .png, .jpeg, .tiff, .heic]
+        panel.message = "Choose images to attach"
+        panel.prompt = "Attach"
+        guard panel.runModal() == .OK else { return }
+        let urls = panel.urls
+        // Computed after the modal returns so it reflects the current list.
+        // The guard also keeps `prefix(_:)` from trapping on a negative length.
+        let remainingSlots = Self.maxAttachments - attachments.count
+        guard remainingSlots > 0 else {
+            reportAttachmentLimit()
+            return
+        }
+        if urls.count > remainingSlots {
+            reportAttachmentLimit()
+        }
+        for url in urls.prefix(remainingSlots) {
+            guard let data = try? Data(contentsOf: url) else {
+                // Mirror the drop path: report unreadable files instead of
+                // skipping them without feedback.
+                attachmentError = "Couldn't read \(url.lastPathComponent)"
+                scheduleAttachmentErrorClear()
+                continue
+            }
+            isEncodingAttachment = true
+            encode(data: data, filename: url.lastPathComponent)
+        }
+        #endif
+    }
 }
 
 private extension Array {
diff --git a/scarf/scarf/Features/Chat/Views/RichChatView.swift b/scarf/scarf/Features/Chat/Views/RichChatView.swift
index c7e7d86..c2ee6c0 100644
--- a/scarf/scarf/Features/Chat/Views/RichChatView.swift
+++ b/scarf/scarf/Features/Chat/Views/RichChatView.swift
@@ -17,7 +17,7 @@ import ScarfDesign
 /// can scroll horizontally inside the panes rather than losing them.
 struct RichChatView: View {
     @Bindable var richChat: RichChatViewModel
-    var onSend: (String) -> Void
+    var onSend: (String, [ChatImageAttachment]) -> Void
     var isEnabled: Bool
     @Environment(HermesFileWatcher.self) private var fileWatcher
     @Environment(ChatViewModel.self) private var chatViewModel