feat(chat): per-message TTS playback in assistant bubbles (#66)

Adds a small speaker glyph to the metadata footer of each settled assistant bubble. Tap to read the reply aloud through `AVSpeechSynthesizer`; tap again to stop, or tap another bubble's button to switch playback to that reply. Picks up the user's macOS Spoken Content default voice automatically — no Hermes dependency, works offline.

- New `MessageSpeechService` (`Core/Services/`) — shared `@Observable` synthesizer; `playingMessageId` drives icon state. Markdown control characters (asterisks, backticks, link syntax) are stripped before speech so the user doesn't hear "asterisk asterisk bold".
- `SpeakMessageButton` lives outside `RichMessageBubble.==` so the bubble's Equatable short-circuit doesn't freeze the icon when playback flips between messages.

The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI / NeuTTS / Piper from Settings → Voice) is a much bigger follow-up — wiring per-provider audio fetching, caching, and streamed playback is its own quarter. v2.6.0 ships the immediate "listen while doing something else" affordance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 10:36:35 +00:00 · 2026-05-01 15:38:22 +02:00
parent 596c844da5
commit 254af46e93
2 changed files with 151 additions and 0 deletions
@@ -0,0 +1,110 @@
 import Foundation
 import AVFoundation
 import os
 import Observation
 /// Per-message text-to-speech for assistant chat replies (issue #66).
 /// Uses `AVSpeechSynthesizer` with the system voice — no Hermes
 /// dependency, works offline, picks up the user's macOS Spoken Content
 /// voice selection automatically.
 ///
 /// One synthesizer is shared across the app so starting a second
 /// message's playback automatically interrupts the first. The
 /// per-message speaker button reads `playingMessageId` to render
 /// play vs. stop state.
 ///
 /// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
 /// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up
 /// — wiring per-provider audio fetching, caching, and interruption
 /// is a much bigger surface than what's needed to give users a
 /// listen-while-doing-other-work affordance today.
@MainActor
@Observable
final class MessageSpeechService: NSObject {
    /// Process-wide instance; every speaker button talks to this one
    /// synthesizer so only a single reply is ever audible.
    static let shared = MessageSpeechService()

    /// The message id currently being spoken, or `nil` when idle.
    /// Bubbles read this to flip their speaker icon to a stop glyph.
    private(set) var playingMessageId: Int?

    private let synthesizer = AVSpeechSynthesizer()
    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")

    /// The utterance `playingMessageId` was last set for.
    ///
    /// Delegate callbacks arrive asynchronously, so a cancel/finish
    /// notification for an *interrupted* utterance can land after a
    /// newer one has already started. Callbacks are matched against
    /// this reference and ignored when they belong to a stale
    /// utterance.
    ///
    /// Invariant: this is only ever replaced *after* the replacement
    /// utterance has been allocated, and only cleared by its own end
    /// callback. That keeps the old object alive while the new one is
    /// created, so the two can never share an `ObjectIdentifier`.
    @ObservationIgnored
    private var activeUtterance: AVSpeechUtterance?

    /// Regex rewrites applied by `strippedForSpeech`, compiled once.
    /// A pattern that fails to compile is skipped rather than
    /// crashing — speech is a best-effort affordance.
    private static let markdownRewrites: [(regex: NSRegularExpression, template: String)] = {
        let rules: [(pattern: String, template: String)] = [
            // [text](url) and ![alt](url) → text / alt
            (#"!?\[([^\]]+)\]\([^)]+\)"#, "$1"),
            // *italic* → italic. The look-arounds leave list bullets
            // ("* item") and spaced operators ("2 * 3") untouched.
            (#"(?<![\w*])\*([^\s*](?:[^*\n]*[^\s*])?)\*(?![\w*])"#, "$1"),
            // _italic_ → italic. `\w` covers `_`, so snake_case
            // identifiers never match.
            (#"(?<!\w)_([^\s_](?:[^_\n]*[^\s_])?)_(?!\w)"#, "$1"),
        ]
        return rules.compactMap { rule -> (regex: NSRegularExpression, template: String)? in
            guard let regex = try? NSRegularExpression(pattern: rule.pattern) else { return nil }
            return (regex: regex, template: rule.template)
        }
    }()

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    /// Speak `content`. If a different message is currently playing,
    /// interrupt it. If the same message is currently playing, this
    /// stops playback (toggle behavior).
    ///
    /// - Parameters:
    ///   - messageId: Identifier of the bubble whose button was tapped.
    ///   - content: Raw (markdown) message text; it is cleaned with
    ///     `strippedForSpeech` before being handed to the synthesizer.
    func toggle(messageId: Int, content: String) {
        if playingMessageId == messageId {
            stop()
            return
        }

        // Build the replacement utterance *before* interrupting the
        // old one (see the invariant on `activeUtterance`).
        let cleaned = Self.strippedForSpeech(content)
        let utterance: AVSpeechUtterance? = cleaned.isEmpty ? nil : AVSpeechUtterance(string: cleaned)

        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }

        guard let utterance else {
            // Whatever was playing has just been interrupted, so the
            // UI must not keep showing it as active.
            playingMessageId = nil
            logger.debug("Message \(messageId) has no speakable text after markdown stripping")
            return
        }

        // AVSpeechUtterance honors the user's Spoken Content default
        // voice when `voice` is `nil`, which is the right behavior:
        // users who configured a specific macOS voice get it
        // automatically.
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        activeUtterance = utterance
        playingMessageId = messageId
        synthesizer.speak(utterance)
    }

    /// Stop any in-progress speech and clear `playingMessageId`.
    func stop() {
        guard playingMessageId != nil else { return }
        synthesizer.stopSpeaking(at: .immediate)
        // `activeUtterance` is intentionally left in place; its own
        // cancel callback clears it.
        playingMessageId = nil
    }

    /// Strip markdown control characters before speech so the user
    /// doesn't hear "asterisk asterisk bold". Code fences and inline
    /// code are spoken verbatim minus the backticks. Link and image
    /// wrappers collapse to their visible text; the target URL is
    /// dropped. Bold and italic markers are removed.
    ///
    /// - Parameter raw: Message text as stored (markdown).
    /// - Returns: Text suitable for an utterance, trimmed of leading
    ///   and trailing whitespace; may be empty.
    static func strippedForSpeech(_ raw: String) -> String {
        var out = raw
        // Fences first so "```" isn't left as a lone backtick, then
        // inline-code backticks, then the two bold spellings.
        for marker in ["```", "`", "**", "__"] {
            out = out.replacingOccurrences(of: marker, with: "")
        }
        for rewrite in markdownRewrites {
            let range = NSRange(out.startIndex..., in: out)
            out = rewrite.regex.stringByReplacingMatches(
                in: out,
                options: [],
                range: range,
                withTemplate: rewrite.template
            )
        }
        return out.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Reset playback state once the utterance identified by
    /// `endedID` has finished or been cancelled — but only if it is
    /// still the active one. End notifications for an utterance that
    /// was already superseded are ignored so they can't blank the
    /// icon of the message that replaced it.
    private func utteranceEnded(_ endedID: ObjectIdentifier) {
        guard let active = activeUtterance, ObjectIdentifier(active) == endedID else { return }
        activeUtterance = nil
        playingMessageId = nil
    }
}

// MARK: - AVSpeechSynthesizerDelegate

extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
    // Both callbacks are `nonisolated` and hop to the main actor
    // explicitly. Only the utterance's `ObjectIdentifier` crosses the
    // hop, so the utterance object itself is never captured by the task.

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.utteranceEnded(endedID)
        }
    }

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.utteranceEnded(endedID)
        }
    }
}
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
                    .font(ChatFontScale.monoSmall(chatFontScale))
                    .help("Wall-clock duration of this turn")
            }
            // Per-message TTS playback toggle (issue #66). Only on
            // settled assistant bubbles — streaming bubble (id == 0)
            // would speak partial text. Empty content has nothing to
            // speak.
            if message.id != 0, !message.content.isEmpty {
                speakButton
            }
        }
        .font(ChatFontScale.caption(chatFontScale))
        .foregroundStyle(ScarfColor.foregroundFaint)
        .padding(.leading, 4)
    }
    /// Play/stop control for reading this assistant reply aloud.
    ///
    /// The control is a separate `SpeakMessageButton` view rather
    /// than inline markup: the bubble is `Equatable`, so anything
    /// rendered directly here could be skipped when only playback
    /// state changes. The child view is handed just the message id
    /// and content.
    private var speakButton: some View {
        let button = SpeakMessageButton(
            messageId: message.id,
            content: message.content
        )
        return button
    }
 }
 /// Stand-alone speaker button so the `MessageSpeechService`
 /// observation doesn't get short-circuited by `RichMessageBubble`'s
 /// `Equatable`. Only the button re-renders when playback flips —
 /// the bubble itself stays optimised.
 private struct SpeakMessageButton: View {
    let messageId: Int
    let content: String
    @State private var speech = MessageSpeechService.shared
    var body: some View {
        let isPlaying = speech.playingMessageId == messageId
        Button {
            speech.toggle(messageId: messageId, content: content)
        } label: {
            Image(systemName: isPlaying ? "stop.circle.fill" : "speaker.wave.2")
                .font(.system(size: 11))
                .foregroundStyle(isPlaying ? ScarfColor.accent : ScarfColor.foregroundFaint)
        }
        .buttonStyle(.plain)
        .help(isPlaying ? "Stop speaking" : "Read this reply aloud")
    }
 }
 // MARK: - Content Block Parsing