mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 02:26:37 +00:00
feat(chat): per-message TTS playback in assistant bubbles (#66)
Adds a small speaker glyph to the metadata footer of each settled assistant bubble. Tap to read the reply aloud through `AVSpeechSynthesizer`; tap again (or any other bubble's button) to stop. Picks up the user's macOS Spoken Content default voice automatically — no Hermes dependency, works offline. - New `MessageSpeechService` (`Core/Services/`) — shared `@Observable` synthesizer; `playingMessageId` drives icon state. Markdown control characters (asterisks, backticks, link syntax) are stripped before speech so the user doesn't hear "asterisk asterisk bold". - `SpeakMessageButton` lives outside `RichMessageBubble.==` so the bubble's Equatable short-circuit doesn't freeze the icon when playback flips between messages. The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI / NeuTTS / Piper from Settings → Voice) is a much bigger follow-up — wiring per-provider audio fetching, caching, and streamed playback is its own quarter. v2.6.0 ships the immediate "listen while doing something else" affordance. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import os
|
||||
import Observation
|
||||
|
||||
/// Per-message text-to-speech for assistant chat replies (issue #66).
|
||||
/// Uses `AVSpeechSynthesizer` with the system voice — no Hermes
|
||||
/// dependency, works offline, picks up the user's macOS Spoken Content
|
||||
/// voice selection automatically.
|
||||
///
|
||||
/// One synthesizer is shared across the app so starting a second
|
||||
/// message's playback automatically interrupts the first. The
|
||||
/// per-message speaker button reads `playingMessageId` to render
|
||||
/// play vs. stop state.
|
||||
///
|
||||
/// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
|
||||
/// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up
|
||||
/// — wiring per-provider audio fetching, caching, and interruption
|
||||
/// is a much bigger surface than what's needed to give users a
|
||||
/// listen-while-doing-other-work affordance today.
|
||||
@MainActor
@Observable
final class MessageSpeechService: NSObject {
    static let shared = MessageSpeechService()

    /// The message id currently being spoken, or `nil` when idle.
    /// Bubbles read this to flip their speaker icon to a stop glyph.
    private(set) var playingMessageId: Int?

    /// The utterance most recently handed to the synthesizer. Delegate
    /// callbacks hop to the main actor via `Task`, so a `didCancel` for
    /// an utterance we just interrupted can land *after* a new message
    /// has started playing; comparing utterance identity in
    /// `utteranceDidSettle(_:)` keeps that stale callback from clearing
    /// the replacement message's `playingMessageId`.
    private var currentUtterance: AVSpeechUtterance?

    private let synthesizer = AVSpeechSynthesizer()
    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")

    /// Compiled once — `NSRegularExpression` construction is relatively
    /// expensive and `strippedForSpeech` runs on every playback.
    private static let linkRegex = try? NSRegularExpression(
        pattern: #"\[([^\]]+)\]\([^)]+\)"#
    )

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    /// Speak `content`. If a different message is currently playing,
    /// interrupt it. If the same message is currently playing, this
    /// stops playback (toggle behavior).
    func toggle(messageId: Int, content: String) {
        if playingMessageId == messageId {
            stop()
            return
        }
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }
        let cleaned = Self.strippedForSpeech(content)
        guard !cleaned.isEmpty else { return }
        let utterance = AVSpeechUtterance(string: cleaned)
        // Leaving `voice` nil makes AVSpeechUtterance honor the user's
        // macOS Spoken Content default voice, which is the right
        // behavior: users who configured a specific voice get it
        // automatically.
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        currentUtterance = utterance
        playingMessageId = messageId
        logger.debug("Speaking message \(messageId)")
        synthesizer.speak(utterance)
    }

    /// Stop any in-progress speech and clear `playingMessageId`.
    func stop() {
        guard playingMessageId != nil else { return }
        synthesizer.stopSpeaking(at: .immediate)
        currentUtterance = nil
        playingMessageId = nil
    }

    /// Main-actor landing point for the delegate callbacks below.
    /// Ignores callbacks for anything other than the current utterance
    /// so an interrupted utterance's late `didCancel` cannot wipe the
    /// state of the message that replaced it — see `currentUtterance`.
    private func utteranceDidSettle(_ utterance: AVSpeechUtterance) {
        guard utterance === currentUtterance else { return }
        currentUtterance = nil
        playingMessageId = nil
    }

    /// Strip markdown control characters before speech so the user
    /// doesn't hear "asterisk asterisk bold". Code fences and inline
    /// code are spoken verbatim minus the backticks. Keeps URLs
    /// readable but drops square-bracket link wrappers, emphasis
    /// markers, and heading hashes.
    static func strippedForSpeech(_ raw: String) -> String {
        var out = raw
        // Backticks: a single pass removes fenced-code and inline-code
        // markers alike (subsumes a separate "```" replacement).
        out = out.replacingOccurrences(of: "`", with: "")
        // Emphasis: paired markers first, then any stray single
        // asterisks (italics, bullet markers) so the user never hears
        // "asterisk". Single underscores are deliberately kept —
        // stripping them would mangle snake_case identifiers.
        out = out.replacingOccurrences(of: "**", with: "")
        out = out.replacingOccurrences(of: "__", with: "")
        out = out.replacingOccurrences(of: "*", with: "")
        // Heading markers at line starts: "## Title" → "Title".
        out = out.replacingOccurrences(
            of: #"(?m)^#{1,6}\s+"#,
            with: "",
            options: .regularExpression
        )
        // Link syntax: [text](url) → text. Bare URLs are untouched.
        if let regex = Self.linkRegex {
            let range = NSRange(out.startIndex..., in: out)
            out = regex.stringByReplacingMatches(
                in: out,
                options: [],
                range: range,
                withTemplate: "$1"
            )
        }
        return out.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}

// MARK: - AVSpeechSynthesizerDelegate

extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
    // Delegate callbacks arrive off the main actor; both funnel through
    // utteranceDidSettle(_:) so stale callbacks are filtered in one place.
    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.utteranceDidSettle(utterance)
        }
    }

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.utteranceDidSettle(utterance)
        }
    }
}
|
||||
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
|
||||
.font(ChatFontScale.monoSmall(chatFontScale))
|
||||
.help("Wall-clock duration of this turn")
|
||||
}
|
||||
// Per-message TTS playback toggle (issue #66). Only on
|
||||
// settled assistant bubbles — streaming bubble (id == 0)
|
||||
// would speak partial text. Empty content has nothing to
|
||||
// speak.
|
||||
if message.id != 0, !message.content.isEmpty {
|
||||
speakButton
|
||||
}
|
||||
}
|
||||
.font(ChatFontScale.caption(chatFontScale))
|
||||
.foregroundStyle(ScarfColor.foregroundFaint)
|
||||
.padding(.leading, 4)
|
||||
}
|
||||
|
||||
/// Speaker glyph that toggles `AVSpeechSynthesizer` playback for
/// the assistant reply (issue #66). Lives in its own view so the
/// `MessageSpeechService` observation doesn't fight the bubble's
/// `Equatable` short-circuit — the parent only needs to pass the
/// stable id + content; the child view re-renders on its own when
/// playback state flips, leaving the bubble's diffing untouched.
private var speakButton: some View {
    SpeakMessageButton(messageId: message.id, content: message.content)
}
|
||||
}
|
||||
|
||||
/// Stand-alone speaker button so the `MessageSpeechService`
/// observation doesn't get short-circuited by `RichMessageBubble`'s
/// `Equatable`. Only this button re-renders when playback flips —
/// the bubble itself keeps its diffing optimisation.
private struct SpeakMessageButton: View {
    let messageId: Int
    let content: String

    @State private var speech = MessageSpeechService.shared

    /// True when this bubble's message is the one being spoken; drives
    /// the glyph swap (speaker ↔ stop) and the accent tint.
    private var isActive: Bool {
        speech.playingMessageId == messageId
    }

    var body: some View {
        Button(action: handleTap) {
            Image(systemName: isActive ? "stop.circle.fill" : "speaker.wave.2")
                .font(.system(size: 11))
                .foregroundStyle(isActive ? ScarfColor.accent : ScarfColor.foregroundFaint)
        }
        .buttonStyle(.plain)
        .help(isActive ? "Stop speaking" : "Read this reply aloud")
    }

    /// Start speaking this message, or stop it if already playing
    /// (the service implements the toggle semantics).
    private func handleTap() {
        speech.toggle(messageId: messageId, content: content)
    }
}
|
||||
|
||||
// MARK: - Content Block Parsing
|
||||
|
||||
Reference in New Issue
Block a user