mirror of
https://github.com/awizemann/scarf.git
synced 2026-05-10 10:36:35 +00:00
feat(chat): per-message TTS playback in assistant bubbles (#66)
Adds a small speaker glyph to the metadata footer of each settled assistant bubble. Tap to read the reply aloud through `AVSpeechSynthesizer`; tap again (or any other bubble's button) to stop. Picks up the user's macOS Spoken Content default voice automatically — no Hermes dependency, works offline.

- New `MessageSpeechService` (`Core/Services/`) — shared `@Observable` synthesizer; `playingMessageId` drives icon state. Markdown control characters (asterisks, backticks, link syntax) are stripped before speech so the user doesn't hear "asterisk asterisk bold".
- `SpeakMessageButton` lives outside `RichMessageBubble.==` so the bubble's Equatable short-circuit doesn't freeze the icon when playback flips between messages.

The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI / NeuTTS / Piper from Settings → Voice) is a much bigger follow-up — wiring per-provider audio fetching, caching, and streamed playback is its own quarter. v2.6.0 ships the immediate "listen while doing something else" affordance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,110 @@
|
|||||||
|
import Foundation
|
||||||
|
import AVFoundation
|
||||||
|
import os
|
||||||
|
import Observation
|
||||||
|
|
||||||
|
/// Per-message text-to-speech for assistant chat replies (issue #66).
///
/// Speaks through a single shared `AVSpeechSynthesizer`. Utterances are
/// created without an explicit voice so the system's default (the user's
/// macOS Spoken Content selection) is used — no Hermes dependency, works
/// offline.
///
/// One synthesizer is shared across the app so starting a second
/// message's playback interrupts the first. The per-message speaker
/// button reads `playingMessageId` to render play vs. stop state.
///
/// The full Hermes-provider TTS pipeline (Edge / ElevenLabs / OpenAI
/// / NeuTTS / Piper from Settings → Voice) is deferred to a follow-up
/// — wiring per-provider audio fetching, caching, and interruption
/// is a much bigger surface than what's needed to give users a
/// listen-while-doing-other-work affordance today.
@MainActor
@Observable
final class MessageSpeechService: NSObject {
    static let shared = MessageSpeechService()

    /// The message id currently being spoken, or `nil` when idle.
    /// Bubbles read this to flip their speaker icon to a stop glyph.
    private(set) var playingMessageId: Int?

    private let synthesizer = AVSpeechSynthesizer()
    private let logger = Logger(subsystem: "com.scarf", category: "MessageSpeech")

    /// The utterance that `playingMessageId` refers to. Delegate callbacks
    /// for any other utterance are stale (they belong to playback that was
    /// already replaced or stopped) and must not touch the playing state.
    /// Not observed: views only care about `playingMessageId`.
    @ObservationIgnored
    private var activeUtterance: AVSpeechUtterance?

    private override init() {
        super.init()
        synthesizer.delegate = self
    }

    /// Speak `content`. If a different message is currently playing,
    /// interrupt it. If the same message is currently playing, this
    /// stops playback (toggle behavior).
    ///
    /// - Parameters:
    ///   - messageId: Identifier of the message the button belongs to.
    ///   - content: Raw (markdown) message text; it is cleaned with
    ///     `strippedForSpeech(_:)` before being spoken. If nothing
    ///     speakable remains, any current playback is still stopped and
    ///     no new playback starts.
    func toggle(messageId: Int, content: String) {
        if playingMessageId == messageId {
            stop()
            return
        }

        let cleaned = Self.strippedForSpeech(content)
        // Create the next utterance while the previous one is still
        // retained, so the two objects can never share an identity. The
        // delegate path compares identities to discard stale callbacks.
        let next = cleaned.isEmpty ? nil : AVSpeechUtterance(string: cleaned)

        stop()

        guard let next else {
            logger.debug("Message \(messageId) has nothing speakable; skipping playback")
            return
        }

        // Leaving `voice` unset lets the system pick the user's Spoken
        // Content default voice, so users who configured a specific
        // macOS voice get it automatically.
        next.rate = AVSpeechUtteranceDefaultSpeechRate
        activeUtterance = next
        playingMessageId = messageId
        synthesizer.speak(next)
    }

    /// Stop any in-progress speech and clear `playingMessageId`.
    ///
    /// Safe to call when idle. The synthesizer is silenced whenever it
    /// reports that it is speaking, regardless of `playingMessageId`, so
    /// the two can never drift apart.
    func stop() {
        // Forget the utterance first: the `didCancel` callback triggered
        // below then no longer matches and is ignored.
        activeUtterance = nil
        if playingMessageId != nil {
            playingMessageId = nil
        }
        if synthesizer.isSpeaking {
            synthesizer.stopSpeaking(at: .immediate)
        }
    }

    /// Handles a finished or cancelled utterance reported by the delegate.
    ///
    /// Only clears the playing state when `endedID` identifies the
    /// utterance that is currently active. A late callback for an
    /// utterance that was interrupted by a newer one is dropped, so it
    /// cannot flip the newer message's icon back to idle.
    private func utteranceDidEnd(_ endedID: ObjectIdentifier) {
        guard let activeUtterance, ObjectIdentifier(activeUtterance) == endedID else { return }
        self.activeUtterance = nil
        playingMessageId = nil
    }

    /// Strip markdown control characters before speech so the user
    /// doesn't hear "asterisk asterisk bold".
    ///
    /// - Heading markers (`#` … `######` followed by whitespace at the
    ///   start of a line) are dropped.
    /// - Code fences and inline code are spoken verbatim minus the
    ///   backticks.
    /// - `**` and `__` emphasis markers are removed.
    /// - Links `[text](url)` and images `![alt](url)` are reduced to
    ///   their text / alt text. Bare URLs are left as they are.
    ///
    /// - Parameter raw: Message text, possibly containing markdown.
    /// - Returns: The cleaned text with surrounding whitespace and
    ///   newlines trimmed; empty when nothing speakable remains.
    nonisolated static func strippedForSpeech(_ raw: String) -> String {
        var out = raw
        // ATX headings: up to three spaces of indent, 1–6 hashes, then
        // whitespace. "#hashtag" (no space) is left alone.
        out = replacingMatches(
            of: #"^[ \t]{0,3}#{1,6}[ \t]+"#,
            in: out,
            with: "",
            options: [.anchorsMatchLines]
        )
        // Fenced code blocks and inline code → keep contents. Removing
        // every backtick covers both the triple and the single form.
        out = out.replacingOccurrences(of: "`", with: "")
        // Bold/italic markers
        out = out.replacingOccurrences(of: "**", with: "")
        out = out.replacingOccurrences(of: "__", with: "")
        // Link / image syntax: [text](url) or ![alt](url) → text
        out = replacingMatches(of: #"!?\[([^\]]+)\]\([^)]+\)"#, in: out, with: "$1")
        return out.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Replaces every match of `pattern` in `text` with `template`.
    /// Returns `text` unchanged if the pattern fails to compile.
    nonisolated private static func replacingMatches(
        of pattern: String,
        in text: String,
        with template: String,
        options: NSRegularExpression.Options = []
    ) -> String {
        guard let regex = try? NSRegularExpression(pattern: pattern, options: options) else {
            return text
        }
        let fullRange = NSRange(text.startIndex..., in: text)
        return regex.stringByReplacingMatches(
            in: text,
            options: [],
            range: fullRange,
            withTemplate: template
        )
    }
}

// MARK: - AVSpeechSynthesizerDelegate

extension MessageSpeechService: @preconcurrency AVSpeechSynthesizerDelegate {
    // The callbacks are `nonisolated` and hop to the main actor before
    // touching state. Only the utterance's identity crosses the hop
    // (`ObjectIdentifier` is `Sendable`; the utterance itself is not).

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.utteranceDidEnd(endedID)
        }
    }

    nonisolated func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        let endedID = ObjectIdentifier(utterance)
        Task { @MainActor in
            self.utteranceDidEnd(endedID)
        }
    }
}
|
||||||
@@ -364,11 +364,52 @@ struct RichMessageBubble: View, Equatable {
|
|||||||
.font(ChatFontScale.monoSmall(chatFontScale))
|
.font(ChatFontScale.monoSmall(chatFontScale))
|
||||||
.help("Wall-clock duration of this turn")
|
.help("Wall-clock duration of this turn")
|
||||||
}
|
}
|
||||||
|
// Per-message TTS playback toggle (issue #66). Only on
|
||||||
|
// settled assistant bubbles — streaming bubble (id == 0)
|
||||||
|
// would speak partial text. Empty content has nothing to
|
||||||
|
// speak.
|
||||||
|
if message.id != 0, !message.content.isEmpty {
|
||||||
|
speakButton
|
||||||
|
}
|
||||||
}
|
}
|
||||||
.font(ChatFontScale.caption(chatFontScale))
|
.font(ChatFontScale.caption(chatFontScale))
|
||||||
.foregroundStyle(ScarfColor.foregroundFaint)
|
.foregroundStyle(ScarfColor.foregroundFaint)
|
||||||
.padding(.leading, 4)
|
.padding(.leading, 4)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Per-message speaker toggle for this assistant reply.
///
/// The actual button is a separate view (`SpeakMessageButton`) so that
/// observing `MessageSpeechService` doesn't fight the bubble's
/// `Equatable` short-circuit: the bubble only hands over the message's
/// id and content, and the button re-renders by itself whenever the
/// playback state flips.
private var speakButton: some View {
    SpeakMessageButton(
        messageId: message.id,
        content: message.content
    )
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stand-alone speaker button so the `MessageSpeechService`
/// observation doesn't get short-circuited by `RichMessageBubble`'s
/// `Equatable`. Only the button re-renders when playback flips —
/// the bubble itself stays optimised.
///
/// Shows a speaker glyph while idle and a stop glyph while this
/// message is the one being spoken; tapping calls
/// `MessageSpeechService.toggle(messageId:content:)`.
private struct SpeakMessageButton: View {
    /// Id of the message this button controls; compared against the
    /// service's `playingMessageId` to decide which glyph to show.
    let messageId: Int
    /// Message text handed to the speech service when playback starts.
    let content: String

    @State private var speech = MessageSpeechService.shared

    var body: some View {
        let isPlaying = speech.playingMessageId == messageId
        // One caption for both the tooltip and assistive technologies:
        // the button is icon-only, so without an explicit label a
        // screen reader would only get the symbol's default description.
        let caption: LocalizedStringKey = isPlaying ? "Stop speaking" : "Read this reply aloud"
        Button {
            speech.toggle(messageId: messageId, content: content)
        } label: {
            Image(systemName: isPlaying ? "stop.circle.fill" : "speaker.wave.2")
                .font(.system(size: 11))
                .foregroundStyle(isPlaying ? ScarfColor.accent : ScarfColor.foregroundFaint)
        }
        .buttonStyle(.plain)
        .help(caption)
        .accessibilityLabel(caption)
    }
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - Content Block Parsing
|
// MARK: - Content Block Parsing
|
||||||
|
|||||||
Reference in New Issue
Block a user