Skip to content
98 changes: 56 additions & 42 deletions source/synthDrivers/sapi5.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def ISequentialStream_RemoteWrite(
if (
synth._isFirstAudioChunk
and synth.sonicStream.samplesAvailable
< synth.sonicStream.sampleRate * 1000 // _FIRST_AUDIO_CHUNK_MIN_DURATION_MS
< synth.sonicStream.sampleRate * _FIRST_AUDIO_CHUNK_MIN_DURATION_MS // 1000
):
return
synth._isFirstAudioChunk = False
Expand Down Expand Up @@ -504,28 +504,19 @@ def Bookmark(self, streamNum: int, pos: int, bookmark: str, bookmarkId: int):
def EndStream(self, streamNum: int, pos: int):
synth = self.synthRef()
if synth._isCancelling:
synth._bookmarkLists.clear()
return
if synth.player:
# Flush the stream and get the remaining data.
synth.sonicStream.flush()
audioData = synth.sonicStream.readShort()
synth.player.feed(audioData, len(audioData) * 2)
# trigger all untriggered bookmarks
if synth._bookmarkLists:
for bookmark in synth._bookmarkLists[0]:
synthIndexReached.notify(synth=synth, index=bookmark)
synth._bookmarkLists.pop()
synth._isSpeaking = False
synthDoneSpeaking.notify(synth=synth)
if synth.player:
# notify the thread
# WASAPI is on
# Notify the thread
# Handle EndStream in that thread
with synth._threadCond:
synth._isCompleted = True
synth._threadCond.notify()
if synth._audioDucker:
if audioDucking._isDebug():
log.debug("Disabling audio ducking due to speech stream end")
synth._audioDucker.disable()
else:
# WASAPI is off
# Handle EndStream immediately
synth._onEndStream()

def onIndexReached(self, streamNum: int, index: int):
synth = self.synthRef()
Expand Down Expand Up @@ -598,26 +589,31 @@ def __init__(self, _defaultVoiceToken=None):
This variable is not doing anything useful, and may be removed together with all its references
when the property "isSpeaking" is removed."""
self._isCancelling = False
self._isTerminating = False
self._isStoppingThread = False
self._isFirstAudioChunk = False
self._rateBoost = False
self._initTts(_defaultVoiceToken)
self._bookmarkLists: deque[deque[int]] = deque()
self._thread = threading.Thread(target=self._speakThread, name="Sapi5SpeakThread")
self._thread: threading.Thread | None = None
self._threadCond = threading.Condition()
self._speakRequests: deque[_SpeakRequest] = deque()
self._isCompleted = False # True when the last speak request reaches EndStream
self._cancellationCond = threading.Condition() # used to wait for cancellation to complete
self._thread.start()
self._initTts(_defaultVoiceToken)

def terminate(self):
def _stopThread(self) -> None:
"""Stops the WASAPI speak thread (if it's running) and waits for the thread to quit."""
self._isStoppingThread = True
# Wake up and wait for the speak thread.
self._isTerminating = True
if self.player:
self.player.stop() # Ensure the player is stopped to avoid blocking the thread.
with self._threadCond:
self._threadCond.notify_all()
self._thread.join()
if self._thread and self._thread.is_alive():
with self._threadCond:
self._threadCond.notify_all()
self._thread.join()
self._thread = None
self._isStoppingThread = False

def terminate(self):
self._stopThread()
self.tts = None
if self.player:
self.player.close()
Expand Down Expand Up @@ -724,6 +720,9 @@ def _initWasapiAudio(self):
sonicInitialize()
self.sonicStream = SonicStream(wfx.nSamplesPerSec, wfx.nChannels)

self._thread = threading.Thread(target=self._speakThread, name="Sapi5SpeakThread")
self._thread.start()

def _initLegacyAudio(self):
if audioDucking.isAudioDuckingSupported():
self._audioDucker = audioDucking.AudioDucker()
Expand All @@ -744,6 +743,7 @@ def _initTts(self, voice: str | None = None):
# Otherwise, we will get poor speech quality in some cases.
self.tts.Voice = voice

self._stopThread()
if self.player:
self.player.close()
self.player = None
Expand Down Expand Up @@ -814,12 +814,27 @@ def _convertPhoneme(self, ipa):
return " ".join(out)

def _requestsAvailable(self) -> bool:
return self._speakRequests or self._isCancelling or self._isTerminating
return self._speakRequests or self._isCancelling or self._isStoppingThread

def _requestCompleted(self) -> bool:
return self._isCompleted or self._isCancelling or self._isTerminating
return self._isCompleted or self._isCancelling or self._isStoppingThread

def _onEndStream(self) -> None:
    """Common handling when a speech stream ends.

    Fires ``synthIndexReached`` for every bookmark still pending in the
    oldest bookmark list, discards that list, marks the synth as no longer
    speaking, fires ``synthDoneSpeaking`` and, when an audio ducker is
    present, disables it.
    """
    # Trigger all untriggered bookmarks of the stream that just ended.
    if self._bookmarkLists:
        for bookmark in self._bookmarkLists[0]:
            synthIndexReached.notify(synth=self, index=bookmark)
        # The list that was just drained sits at the LEFT end of the deque
        # (index 0), so it must be removed with popleft(). A plain pop()
        # would drop the newest list on the right instead, losing the
        # bookmarks of a request that is still queued.
        self._bookmarkLists.popleft()
    self._isSpeaking = False
    synthDoneSpeaking.notify(synth=self)
    if self._audioDucker:
        if audioDucking._isDebug():
            log.debug("Disabling audio ducking due to speech stream end")
        self._audioDucker.disable()

def _speakThread(self):
"""Thread that processes speech when WASAPI is enabled."""
# Handles speak requests in the queue one by one.
# Only one request will be processed (spoken) at a time.
# We don't use SAPI5's built-in speech queue,
Expand All @@ -832,7 +847,7 @@ def _speakThread(self):
request: _SpeakRequest | None = None

# Process requests one by one.
while not self._isTerminating:
while not self._isStoppingThread:
# Fetch the next request
with self._threadCond:
self._threadCond.wait_for(self._requestsAvailable)
Expand All @@ -847,6 +862,12 @@ def _speakThread(self):
self.tts.Speak(text, SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async)
with self._threadCond:
self._threadCond.wait_for(self._requestCompleted)
if not self._isCancelling:
# Flush the stream and play the remaining data.
self.sonicStream.flush()
audioData = self.sonicStream.readShort()
self.player.feed(audioData, len(audioData) * 2)
self._onEndStream()
except Exception:
self._bookmarkLists.pop()
log.error("Error speaking", exc_info=True)
Expand All @@ -856,16 +877,11 @@ def _speakThread(self):
self.player.idle()
if self._isCancelling:
self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
# clear the queue
with self._threadCond:
self._speakRequests.clear()
self._bookmarkLists.clear()
if self.sonicStream:
self.sonicStream.flush()
self.sonicStream.readShort() # discard data left in stream
with self._cancellationCond:
self._isCancelling = False
self._cancellationCond.notify_all()
self._isCancelling = False

def speak(self, speechSequence):
textList = []
Expand Down Expand Up @@ -1030,11 +1046,9 @@ def cancel(self):
self._isCancelling = True
if self.player:
self.player.stop() # stop the audio and stop waiting for idle()
with self._threadCond: # wake up the thread
with self._threadCond: # clear the queue and wake up the thread
self._speakRequests.clear()
self._threadCond.notify()
with self._cancellationCond: # wait for cancellation to complete
while self._isCancelling:
self._cancellationCond.wait()
if self.ttsAudioStream:
# For legacy audio
# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
Expand Down Expand Up @@ -1083,4 +1097,4 @@ def __getattr__(self, attrName: str) -> Any:
# and all its references can also be removed,
# as it is not doing anything useful.
return self._isSpeaking
return super().__getattr__(attrName)
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attrName}'")