mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 02:26:37 +00:00
feat(chat): per-message TTS playback in assistant bubbles (#66)
Adds a small speaker glyph to the metadata footer of each settled assistant bubble. Tap to read the reply aloud through `AVSpeechSynthesizer`; tap again (or any other bubble's button) to stop. Picks up the user's macOS Spoken Content default voice automatically — no Hermes dependency, works offline. - New `MessageSpeechService` (`Core/Services/`) — shared `@Observable` synthesizer; `playingMessageId` drives icon state. Markdown control characters (asterisks, backticks, link syntax) are stripped before speech so the user doesn't hear "asterisk asterisk bold". - `SpeakMessageButton` lives outside `RichMessageBubble.==` so the bubble's Equatable short-circuit doesn't freeze the icon when playback flips between messages. The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI / NeuTTS / Piper from Settings → Voice) is a much bigger follow-up — wiring per-provider audio fetching, caching, and streamed playback is its own quarter. v2.6.0 ships the immediate "listen while doing something else" affordance. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import os
|
||||
import Observation
|
||||
|
||||
/// Per-message text-to-speech for assistant chat replies (issue #66).
|
||||
/// Uses `AVSpeechSynthesizer` with the system voice — no Hermes
|
||||
/// dependency, works offline, picks up the user's macOS Spoken Content
|
||||
/// voice selection automatically.
|
||||
///
|
||||
/// One synthesizer is shared across the app so starting a second
|
||||
/// message's playback automatically interrupts the first. The
|
||||
/// per-message speaker button reads `playingMessageId` to render
|
||||
/// play vs. stop state.
|
||||
///
|
||||
/// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
|
||||
/// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up
|
||||
/// — wiring per-provider audio fetching, caching, and interruption
|
||||
/// is a much bigger surface than what's needed to give users a
|
||||
/// listen-while-doing-other-work affordance today.
|
||||
@MainActor
@Observable
final class MessageSpeechService: NSObject {
    static let shared = MessageSpeechService()

    /// The message id currently being spoken, or `nil` when idle.
    /// Bubbles read this to flip their speaker icon to a stop glyph.
    private(set) var playingMessageId: Int?

    /// The utterance most recently handed to the synthesizer. Delegate
    /// callbacks hop to the main actor via `Task`, so a `didCancel` for
    /// an utterance we just interrupted can land *after* a new message
    /// has started playing; comparing utterance identity in
    /// `utteranceDidSettle(_:)` keeps that stale callback from clearing
    /// the replacement message's `playingMessageId`.
    private var currentUtterance: AVSpeechUtterance?

    private let synthesizer = AVSpeechSynthesizer()
    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")

    /// Compiled once — `NSRegularExpression` construction is relatively
    /// expensive and `strippedForSpeech` runs on every playback.
    private static let linkRegex = try? NSRegularExpression(
        pattern: #"\[([^\]]+)\]\([^)]+\)"#
    )

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    /// Speak `content`. If a different message is currently playing,
    /// interrupt it. If the same message is currently playing, this
    /// stops playback (toggle behavior).
    func toggle(messageId: Int, content: String) {
        if playingMessageId == messageId {
            stop()
            return
        }
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }
        let cleaned = Self.strippedForSpeech(content)
        guard !cleaned.isEmpty else { return }
        let utterance = AVSpeechUtterance(string: cleaned)
        // Leaving `voice` nil makes AVSpeechUtterance honor the user's
        // macOS Spoken Content default voice, which is the right
        // behavior: users who configured a specific voice get it
        // automatically.
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        currentUtterance = utterance
        playingMessageId = messageId
        logger.debug("Speaking message \(messageId)")
        synthesizer.speak(utterance)
    }

    /// Stop any in-progress speech and clear `playingMessageId`.
    func stop() {
        guard playingMessageId != nil else { return }
        synthesizer.stopSpeaking(at: .immediate)
        currentUtterance = nil
        playingMessageId = nil
    }

    /// Main-actor landing point for the delegate callbacks below.
    /// Ignores callbacks for anything other than the current utterance
    /// so an interrupted utterance's late `didCancel` cannot wipe the
    /// state of the message that replaced it — see `currentUtterance`.
    private func utteranceDidSettle(_ utterance: AVSpeechUtterance) {
        guard utterance === currentUtterance else { return }
        currentUtterance = nil
        playingMessageId = nil
    }

    /// Strip markdown control characters before speech so the user
    /// doesn't hear "asterisk asterisk bold". Code fences and inline
    /// code are spoken verbatim minus the backticks. Keeps URLs
    /// readable but drops square-bracket link wrappers, emphasis
    /// markers, and heading hashes.
    static func strippedForSpeech(_ raw: String) -> String {
        var out = raw
        // Backticks: a single pass removes fenced-code and inline-code
        // markers alike (subsumes a separate "```" replacement).
        out = out.replacingOccurrences(of: "`", with: "")
        // Emphasis: paired markers first, then any stray single
        // asterisks (italics, bullet markers) so the user never hears
        // "asterisk". Single underscores are deliberately kept —
        // stripping them would mangle snake_case identifiers.
        out = out.replacingOccurrences(of: "**", with: "")
        out = out.replacingOccurrences(of: "__", with: "")
        out = out.replacingOccurrences(of: "*", with: "")
        // Heading markers at line starts: "## Title" → "Title".
        out = out.replacingOccurrences(
            of: #"(?m)^#{1,6}\s+"#,
            with: "",
            options: .regularExpression
        )
        // Link syntax: [text](url) → text. Bare URLs are untouched.
        if let regex = Self.linkRegex {
            let range = NSRange(out.startIndex..., in: out)
            out = regex.stringByReplacingMatches(
                in: out,
                options: [],
                range: range,
                withTemplate: "$1"
            )
        }
        return out.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}

// MARK: - AVSpeechSynthesizerDelegate

extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
    // Delegate callbacks arrive off the main actor; both funnel through
    // utteranceDidSettle(_:) so stale callbacks are filtered in one place.
    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.utteranceDidSettle(utterance)
        }
    }

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        Task { @MainActor in
            self.utteranceDidSettle(utterance)
        }
    }
}
|
||||
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
|
||||
.font(ChatFontScale.monoSmall(chatFontScale))
|
||||
.help("Wall-clock duration of this turn")
|
||||
}
|
||||
// Per-message TTS playback toggle (issue #66). Only on
|
||||
// settled assistant bubbles — streaming bubble (id == 0)
|
||||
// would speak partial text. Empty content has nothing to
|
||||
// speak.
|
||||
if message.id != 0, !message.content.isEmpty {
|
||||
speakButton
|
||||
}
|
||||
}
|
||||
.font(ChatFontScale.caption(chatFontScale))
|
||||
.foregroundStyle(ScarfColor.foregroundFaint)
|
||||
.padding(.leading, 4)
|
||||
}
|
||||
|
||||
/// Speaker glyph that toggles `AVSpeechSynthesizer` playback for
/// the assistant reply (issue #66). Lives in its own view so the
/// `MessageSpeechService` observation doesn't fight the bubble's
/// `Equatable` short-circuit — the parent only needs to pass the
/// stable id + content; the child view re-renders on its own when
/// playback state flips, leaving the bubble's diffing untouched.
private var speakButton: some View {
    SpeakMessageButton(messageId: message.id, content: message.content)
}
|
||||
}
|
||||
|
||||
/// Stand-alone speaker button so the `MessageSpeechService`
/// observation doesn't get short-circuited by `RichMessageBubble`'s
/// `Equatable`. Only this button re-renders when playback flips —
/// the bubble itself keeps its diffing optimisation.
private struct SpeakMessageButton: View {
    let messageId: Int
    let content: String

    @State private var speech = MessageSpeechService.shared

    /// True when this bubble's message is the one being spoken; drives
    /// the glyph swap (speaker ↔ stop) and the accent tint.
    private var isActive: Bool {
        speech.playingMessageId == messageId
    }

    var body: some View {
        Button(action: handleTap) {
            Image(systemName: isActive ? "stop.circle.fill" : "speaker.wave.2")
                .font(.system(size: 11))
                .foregroundStyle(isActive ? ScarfColor.accent : ScarfColor.foregroundFaint)
        }
        .buttonStyle(.plain)
        .help(isActive ? "Stop speaking" : "Read this reply aloud")
    }

    /// Start speaking this message, or stop it if already playing
    /// (the service implements the toggle semantics).
    private func handleTap() {
        speech.toggle(messageId: messageId, content: content)
    }
}
|
||||
|
||||
// MARK: - Content Block Parsing
|
||||
|
||||
Reference in New Issue
Block a user