feat(chat): per-message TTS playback in assistant bubbles (#66)

Adds a small speaker glyph to the metadata footer of each settled
assistant bubble. Tap to read the reply aloud through
`AVSpeechSynthesizer`; tap again (or any other bubble's button) to
stop. Picks up the user's macOS Spoken Content default voice
automatically — no Hermes dependency, works offline.

- New `MessageSpeechService` (`Core/Services/`) — shared
  `@Observable` synthesizer; `playingMessageId` drives icon
  state. Markdown control characters (asterisks, backticks,
  link syntax) are stripped before speech so the user doesn't
  hear "asterisk asterisk bold".
- `SpeakMessageButton` lives outside `RichMessageBubble.==` so
  the bubble's Equatable short-circuit doesn't freeze the icon
  when playback flips between messages.

The full Hermes-provider TTS pipeline (Edge / ElevenLabs /
OpenAI / NeuTTS / Piper from Settings → Voice) is a much bigger
follow-up — wiring per-provider audio fetching, caching, and
streamed playback is its own quarter. v2.6.0 ships the immediate
"listen while doing something else" affordance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alan Wizemann
2026-05-01 15:38:22 +02:00
parent 596c844da5
commit 254af46e93
2 changed files with 151 additions and 0 deletions
@@ -0,0 +1,110 @@
import Foundation
import AVFoundation
import os
import Observation
/// Per-message text-to-speech for assistant chat replies (issue #66).
/// Uses `AVSpeechSynthesizer` with the system voice — no Hermes
/// dependency, works offline, picks up the user's macOS Spoken Content
/// voice selection automatically.
///
/// One synthesizer is shared across the app so starting a second
/// message's playback automatically interrupts the first. The
/// per-message speaker button reads `playingMessageId` to render
/// play vs. stop state.
///
/// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
/// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up —
/// wiring per-provider audio fetching, caching, and interruption
/// is a much bigger surface than what's needed to give users a
/// listen-while-doing-other-work affordance today.
@MainActor
@Observable
final class MessageSpeechService: NSObject {
    static let shared = MessageSpeechService()

    /// The message id currently being spoken, or `nil` when idle.
    /// Bubbles read this to flip their speaker icon to a stop glyph.
    private(set) var playingMessageId: Int?

    /// The utterance that belongs to `playingMessageId`. Delegate
    /// callbacks are matched against it so that a late `didCancel`
    /// for an interrupted utterance cannot wipe the state of the
    /// utterance that replaced it. Not observed: views only care
    /// about `playingMessageId`.
    @ObservationIgnored
    private var currentUtterance: AVSpeechUtterance?

    private let synthesizer = AVSpeechSynthesizer()
    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    /// Speak `content`. If a different message is currently playing,
    /// interrupt it. If the same message is currently playing, this
    /// stops playback (toggle behavior).
    ///
    /// - Parameters:
    ///   - messageId: Identifier of the message whose button was tapped.
    ///   - content: Raw (markdown) message text; it is passed through
    ///     `strippedForSpeech(_:)` before being spoken.
    func toggle(messageId: Int, content: String) {
        if playingMessageId == messageId {
            stop()
            return
        }

        // Interrupt whatever else is playing and reset our state right
        // away. The cancelled utterance's delegate callback arrives
        // later and is ignored because it no longer matches
        // `currentUtterance`.
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }
        currentUtterance = nil
        if playingMessageId != nil {
            playingMessageId = nil
        }

        let cleaned = Self.strippedForSpeech(content)
        guard !cleaned.isEmpty else {
            logger.debug("Nothing speakable in message \(messageId) after stripping markdown")
            return
        }

        let utterance = AVSpeechUtterance(string: cleaned)
        // AVSpeechUtterance honors the user's Spoken Content default
        // voice when `voice` is `nil`, which is the right behavior:
        // users who configured a specific macOS voice get it
        // automatically.
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        currentUtterance = utterance
        playingMessageId = messageId
        synthesizer.speak(utterance)
    }

    /// Stop any in-progress speech and clear `playingMessageId`.
    /// Does nothing when no message is marked as playing.
    func stop() {
        guard playingMessageId != nil else { return }
        synthesizer.stopSpeaking(at: .immediate)
        currentUtterance = nil
        playingMessageId = nil
    }

    /// Reset playback state once the synthesizer reports that an
    /// utterance finished or was cancelled — but only when that
    /// utterance is still the current one.
    ///
    /// - Parameter utteranceID: Identity of the utterance the delegate
    ///   callback was delivered for.
    private func playbackEnded(for utteranceID: ObjectIdentifier) {
        guard let currentUtterance,
              ObjectIdentifier(currentUtterance) == utteranceID else { return }
        self.currentUtterance = nil
        playingMessageId = nil
    }

    /// Strip markdown control characters before speech so the user
    /// doesn't hear "asterisk asterisk bold". Code fences and inline
    /// code are spoken verbatim minus the backticks. Keeps link and
    /// image text but drops the square-bracket wrapper and the URL.
    ///
    /// Pure string transformation, hence `nonisolated`.
    ///
    /// - Parameter raw: Message text, possibly containing markdown.
    /// - Returns: The text with backticks, `**` / `__` markers and
    ///   link/image wrappers removed, trimmed of surrounding
    ///   whitespace and newlines.
    nonisolated static func strippedForSpeech(_ raw: String) -> String {
        var out = raw
        // Fenced code blocks → keep contents
        out = out.replacingOccurrences(of: "```", with: "")
        // Inline code → drop backticks
        out = out.replacingOccurrences(of: "`", with: "")
        // Bold markers
        out = out.replacingOccurrences(of: "**", with: "")
        out = out.replacingOccurrences(of: "__", with: "")
        // Link / image syntax: [text](url) or ![alt](url) → text
        if let regex = try? NSRegularExpression(
            pattern: #"!?\[([^\]]+)\]\([^)]+\)"#,
            options: []
        ) {
            let range = NSRange(out.startIndex..., in: out)
            out = regex.stringByReplacingMatches(
                in: out,
                options: [],
                range: range,
                withTemplate: "$1"
            )
        }
        return out.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}

// MARK: - AVSpeechSynthesizerDelegate

extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
    /// Natural end of an utterance: hop to the main actor and clear the
    /// playing state if this utterance is still the current one.
    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        // Carry only the identity across the actor hop; the utterance
        // object itself is not needed on the other side.
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.playbackEnded(for: endedID)
        }
    }

    /// Cancellation of an utterance. When the cancellation was caused by
    /// starting another message, the identity check leaves the new
    /// message's playing state untouched.
    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.playbackEnded(for: endedID)
        }
    }
}
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
.font(ChatFontScale.monoSmall(chatFontScale))
.help("Wall-clock duration of this turn")
}
// Per-message TTS playback toggle (issue #66). Only on
// settled assistant bubbles — the streaming bubble (id == 0)
// would speak partial text. Empty content has nothing to
// speak.
if message.id != 0, !message.content.isEmpty {
speakButton
}
}
.font(ChatFontScale.caption(chatFontScale))
.foregroundStyle(ScarfColor.foregroundFaint)
.padding(.leading, 4)
}
/// Read-aloud control for this bubble's assistant reply.
///
/// The actual button is a separate view (`SpeakMessageButton`) that
/// is handed only the message id and content, so this bubble does not
/// have to read playback state itself — the button re-renders on its
/// own when playback flips, independent of the bubble's `Equatable`
/// short-circuit.
private var speakButton: some View {
    let button = SpeakMessageButton(
        messageId: message.id,
        content: message.content
    )
    return button
}
}
/// Stand-alone speaker button so the `MessageSpeechService`
/// observation doesn't get short-circuited by `RichMessageBubble`'s
/// `Equatable`. Only the button re-renders when playback flips —
/// the bubble itself stays optimised.
private struct SpeakMessageButton: View {
    /// Id of the message this button speaks; compared against the
    /// service's `playingMessageId` to choose the icon.
    let messageId: Int
    /// Raw message text handed to the speech service on tap.
    let content: String

    @State private var speech = MessageSpeechService.shared

    var body: some View {
        let isPlaying = speech.playingMessageId == messageId
        // Typed as LocalizedStringKey so the tooltip and the
        // accessibility label share one localizable string.
        let actionTitle: LocalizedStringKey = isPlaying ? "Stop speaking" : "Read this reply aloud"
        Button {
            speech.toggle(messageId: messageId, content: content)
        } label: {
            Image(systemName: isPlaying ? "stop.circle.fill" : "speaker.wave.2")
                .font(.system(size: 11))
                .foregroundStyle(isPlaying ? ScarfColor.accent : ScarfColor.foregroundFaint)
        }
        .buttonStyle(.plain)
        .help(actionTitle)
        // The label is an icon only; give assistive technologies an
        // explicit name that tracks the play/stop state.
        .accessibilityLabel(actionTitle)
    }
}
// MARK: - Content Block Parsing