diff --git a/scarf/scarf/Core/Services/MessageSpeechService.swift b/scarf/scarf/Core/Services/MessageSpeechService.swift
new file mode 100644
index 0000000..71ada56
--- /dev/null
+++ b/scarf/scarf/Core/Services/MessageSpeechService.swift
@@ -0,0 +1,153 @@
+import Foundation
+import AVFoundation
+import os
+import Observation
+
+/// Per-message text-to-speech for assistant chat replies (issue #66).
+/// Uses `AVSpeechSynthesizer` with the system voice — no Hermes
+/// dependency, works offline, picks up the user's macOS Spoken Content
+/// voice selection automatically.
+///
+/// One synthesizer is shared across the app so starting a second
+/// message's playback automatically interrupts the first. The
+/// per-message speaker button reads `playingMessageId` to render
+/// play vs. stop state.
+///
+/// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
+/// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up
+/// — wiring per-provider audio fetching, caching, and interruption
+/// is a much bigger surface than what's needed to give users a
+/// listen-while-doing-other-work affordance today.
+@MainActor
+@Observable
+final class MessageSpeechService: NSObject {
+    static let shared = MessageSpeechService()
+
+    /// The message id currently being spoken, or `nil` when idle.
+    /// Bubbles read this to flip their speaker icon to a stop glyph.
+    private(set) var playingMessageId: Int?
+
+    private let synthesizer = AVSpeechSynthesizer()
+    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")
+
+    /// The utterance that currently owns `playingMessageId`. Delegate
+    /// callbacks are applied asynchronously, so the cancel callback
+    /// of an interrupted utterance lands after the next one has
+    /// started; anything not matching this utterance is stale.
+    @ObservationIgnored
+    private var activeUtterance: AVSpeechUtterance?
+
+    /// Matches markdown links and images: `[text](url)`, `![alt](url)`.
+    /// Compiled once instead of on every call.
+    private static let linkRegex = try? NSRegularExpression(
+        pattern: #"!?\[([^\]]+)\]\([^)]+\)"#,
+        options: []
+    )
+
+    private override init() {
+        super.init()
+        synthesizer.delegate = self
+    }
+
+    /// Speak `content`. If a different message is currently playing,
+    /// interrupt it. If the same message is currently playing, this
+    /// stops playback (toggle behavior).
+    ///
+    /// - Parameters:
+    ///   - messageId: Id of the message whose speaker button was pressed.
+    ///   - content: Raw markdown text of that message.
+    func toggle(messageId: Int, content: String) {
+        if playingMessageId == messageId {
+            stop()
+            return
+        }
+        // Build the replacement while the current utterance is still
+        // retained so the two objects can never share an identity.
+        let cleaned = Self.strippedForSpeech(content)
+        let utterance = cleaned.isEmpty ? nil : AVSpeechUtterance(string: cleaned)
+        // Interrupt and forget whatever is playing *before* starting
+        // the new utterance; the old one's late `didCancel` callback
+        // then no longer matches and cannot wipe the new state.
+        stop()
+        guard let utterance else {
+            logger.debug("Message \(messageId) has no speakable text after markdown stripping")
+            return
+        }
+        // AVSpeechUtterance honors the user's Spoken Content default
+        // voice when `voice` is `nil`, which is the right behavior:
+        // users who configured a specific macOS voice get it
+        // automatically.
+        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
+        activeUtterance = utterance
+        playingMessageId = messageId
+        synthesizer.speak(utterance)
+    }
+
+    /// Stop any in-progress speech and clear `playingMessageId`.
+    func stop() {
+        // Drop the active utterance first so the `didCancel` callback
+        // triggered below is ignored as stale.
+        activeUtterance = nil
+        if playingMessageId != nil {
+            playingMessageId = nil
+        }
+        if synthesizer.isSpeaking {
+            synthesizer.stopSpeaking(at: .immediate)
+        }
+    }
+
+    /// Clears the playing state when the active utterance finishes or
+    /// is cancelled; callbacks for any other utterance are ignored.
+    private func utteranceDidEnd(_ endedID: ObjectIdentifier) {
+        guard let activeUtterance, ObjectIdentifier(activeUtterance) == endedID else { return }
+        self.activeUtterance = nil
+        playingMessageId = nil
+    }
+
+    /// Strip markdown control characters before speech so the user
+    /// doesn't hear "asterisk asterisk bold". Code fences and inline
+    /// code are spoken verbatim minus the backticks. Bare URLs are
+    /// kept; `[text](url)` links and `![alt](url)` images are reduced
+    /// to their text.
+    ///
+    /// - Parameter raw: Message text, possibly containing markdown.
+    /// - Returns: The text to speak, trimmed of surrounding whitespace.
+    static func strippedForSpeech(_ raw: String) -> String {
+        var out = raw
+        // Fenced code blocks → keep contents
+        out = out.replacingOccurrences(of: "```", with: "")
+        // Inline code → drop backticks
+        out = out.replacingOccurrences(of: "`", with: "")
+        // Bold/italic markers
+        out = out.replacingOccurrences(of: "**", with: "")
+        out = out.replacingOccurrences(of: "__", with: "")
+        // Link / image syntax: [text](url), ![alt](url) → text
+        if let regex = linkRegex {
+            let range = NSRange(out.startIndex..., in: out)
+            out = regex.stringByReplacingMatches(
+                in: out,
+                options: [],
+                range: range,
+                withTemplate: "$1"
+            )
+        }
+        return out.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
+
+extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
+    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
+        // Only the `Sendable` identity crosses onto the main actor.
+        let endedID = ObjectIdentifier(utterance)
+        Task { @MainActor in
+            self.utteranceDidEnd(endedID)
+        }
+    }
+
+    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
+        let endedID = ObjectIdentifier(utterance)
+        Task { @MainActor in
+            self.utteranceDidEnd(endedID)
+        }
+    }
+}
diff --git a/scarf/scarf/Features/Chat/Views/RichMessageBubble.swift b/scarf/scarf/Features/Chat/Views/RichMessageBubble.swift
index d9c792d..8f0f685 100644
--- a/scarf/scarf/Features/Chat/Views/RichMessageBubble.swift
+++ b/scarf/scarf/Features/Chat/Views/RichMessageBubble.swift
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
                     .font(ChatFontScale.monoSmall(chatFontScale))
                     .help("Wall-clock duration of this turn")
             }
+            // Per-message TTS playback toggle (issue #66). Only on
+            // settled assistant bubbles — streaming bubble (id == 0)
+            // would speak partial text. Empty content has nothing to
+            // speak.
+            if message.id != 0, !message.content.isEmpty {
+                speakButton
+            }
         }
         .font(ChatFontScale.caption(chatFontScale))
         .foregroundStyle(ScarfColor.foregroundFaint)
         .padding(.leading, 4)
     }
+
+    /// Speaker glyph that toggles `AVSpeechSynthesizer` playback for
+    /// the assistant reply. Lives in its own view so the
+    /// `MessageSpeechService` observation doesn't fight the bubble's
+    /// `Equatable` short-circuit — the parent only needs to pass
+    /// stable id + content; this view re-renders on its own when
+    /// playback state flips.
+    private var speakButton: some View {
+        SpeakMessageButton(messageId: message.id, content: message.content)
+    }
+}
+
+/// Stand-alone speaker button so the `MessageSpeechService`
+/// observation doesn't get short-circuited by `RichMessageBubble`'s
+/// `Equatable`. Only the button re-renders when playback flips —
+/// the bubble itself stays optimised.
+private struct SpeakMessageButton: View {
+    let messageId: Int
+    let content: String
+
+    @State private var speech = MessageSpeechService.shared
+
+    var body: some View {
+        let isPlaying = speech.playingMessageId == messageId
+        Button {
+            speech.toggle(messageId: messageId, content: content)
+        } label: {
+            Image(systemName: isPlaying ? "stop.circle.fill" : "speaker.wave.2")
+                .font(.system(size: 11))
+                .foregroundStyle(isPlaying ? ScarfColor.accent : ScarfColor.foregroundFaint)
+        }
+        .buttonStyle(.plain)
+        .help(isPlaying ? "Stop speaking" : "Read this reply aloud")
+    }
 }
 
 // MARK: - Content Block Parsing