Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix/confirmation_state #125

Merged
merged 15 commits into from
Jun 20, 2024
23 changes: 5 additions & 18 deletions ovos_dinkum_listener/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from hashlib import md5
from pathlib import Path
from tempfile import NamedTemporaryFile
from threading import Thread, RLock, Event, Timer
from threading import Thread, RLock, Event

import speech_recognition as sr
from distutils.spawn import find_executable
Expand Down Expand Up @@ -254,7 +254,7 @@ def _init_voice_loop(self, listener_config: dict):
fallback_stt=self.fallback_stt,
vad=self.vad,
transformers=self.transformers,
#
instant_listen=listener_config.get("instant_listen"),
speech_seconds=listener_config.get("speech_begin", 0.3),
silence_seconds=listener_config.get("silence_end", 0.7),
timeout_seconds=listener_config.get("recording_timeout", 10),
Expand Down Expand Up @@ -611,19 +611,10 @@ def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
event = ww_context.get("event")

if sound:
context = {'client_name': 'ovos_dinkum_listener',
'source': 'listener',
'destination': ["audio"] # default native-source
}
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved
LOG.debug(f"Handling listen sound: {sound}")
self.bus.emit(Message("mycroft.audio.play_sound",
{"uri": sound, "force_unmute": True},
context))
if not listener.get("instant_listen"):
self.voice_loop.state = ListeningState.CONFIRMATION
self.voice_loop.confirmation_event.clear()
Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()

if listen:
msg_type = "recognizer_loop:wakeword"
payload["utterance"] = \
Expand Down Expand Up @@ -788,12 +779,8 @@ def _handle_listen(self, message: Message):
}
message = message or Message("", context=context) # might be None
self.bus.emit(message.forward("mycroft.audio.play_sound", {"uri": sound}))
if not self.config["listener"].get("instant_listen"):
self.voice_loop.state = ListeningState.CONFIRMATION
self.voice_loop.confirmation_event.clear()
Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
else:
self.voice_loop.state = ListeningState.BEFORE_COMMAND
self.voice_loop.state = ListeningState.CONFIRMATION
self.voice_loop.confirmation_seconds_left = self.voice_loop.confirmation_seconds
else:
self.voice_loop.state = ListeningState.BEFORE_COMMAND

Expand Down Expand Up @@ -880,7 +867,7 @@ def _handle_wake_up(self, message: Message):
def _handle_sound_played(self, message: Message):
"""Handle response message from audio service."""
if self.voice_loop.state == ListeningState.CONFIRMATION:
self.voice_loop.confirmation_event.set()
self.voice_loop.state = ListeningState.BEFORE_COMMAND

def _handle_b64_audio(self, message: Message):
""" transcribe base64 encoded audio """
Expand Down
45 changes: 32 additions & 13 deletions ovos_dinkum_listener/voice_loop/voice_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def debiased_energy(audio_data: bytes, sample_width: int) -> float:
@dataclass
class ChunkInfo:
    """Per-chunk flags and measurements tracked by the voice loop.

    Every field has a default, so ``ChunkInfo()`` describes a chunk for
    which nothing has been detected yet.
    """

    # Whether the chunk was classified as speech
    # NOTE(review): the code that sets this is outside this view - confirm
    is_speech: bool = False
    # Set to True by the voice loop while the listen (confirmation) sound
    # is being handled
    is_listen_sound: bool = False
    # Energy measured for the chunk
    # NOTE(review): presumably produced by debiased_energy() - confirm
    energy: float = 0.0


Expand All @@ -106,17 +107,20 @@ class DinkumVoiceLoop(VoiceLoop):
speech_seconds: float = 0.3
silence_seconds: float = 0.7
timeout_seconds: float = 10.0
timeout_seconds_with_silence: float = 5.0
timeout_seconds_with_silence: float = 5.0
confirmation_seconds: float = 0.5 # TODO - can we determine dynamically based on sound file ?
num_stt_rewind_chunks: int = 2
num_hotword_keep_chunks: int = 15
remove_silence: bool = False
instant_listen: bool = False
skip_next_wake: bool = False
hotword_chunks: Deque = field(default_factory=deque)
stt_chunks: Deque = field(default_factory=deque)
stt_audio_bytes: bytes = bytes()
last_ww: float = -1.0
speech_seconds_left: float = 0.0
silence_seconds_left: float = 0.0
confirmation_seconds_left: float = 0.0
timeout_seconds_left: float = 0.0
timeout_seconds_with_silence_left: float = 0.0
recording_seconds_with_silence_left: float = 0.0
Expand Down Expand Up @@ -181,7 +185,6 @@ def run(self):
self.timeout_seconds_left = self.timeout_seconds
self.timeout_seconds_with_silence_left = self.timeout_seconds_with_silence
self.state = ListeningState.DETECT_WAKEWORD
self.confirmation_event = Event()

# Keep hotword/STT audio so they can (optionally) be saved to disk
self.hotword_chunks = deque(maxlen=self.num_hotword_keep_chunks)
Expand Down Expand Up @@ -248,12 +251,10 @@ def run(self):
self._before_wakeup(chunk)
elif self.state == ListeningState.CHECK_WAKE_UP:
self._detect_wakeup(chunk)

# set either by timeout (0.5) or by ovos-audio response
elif self.state == ListeningState.CONFIRMATION and \
self.confirmation_event.is_set():
self.state = ListeningState.BEFORE_COMMAND
LOG.debug(f"STATE: {self.state}")

elif self.state == ListeningState.CONFIRMATION:
LOG.debug("playing listen sound")
self._confirmation_sound(chunk)

elif self.state == ListeningState.BEFORE_COMMAND:
LOG.debug("waiting for speech")
Expand Down Expand Up @@ -485,14 +486,15 @@ def _detect_ww(self, chunk: bytes) -> bool:
ww = self.hotwords.found()
if ww:
LOG.debug(f"Wake word detected={ww}")
ww_data = self.hotwords.get_ww(ww)

# Callback to handle recorded hotword audio
if self.listenword_audio_callback is not None:
hotword_audio_bytes = bytes()
while self.hotword_chunks:
hotword_audio_bytes += self.hotword_chunks.popleft()

self.listenword_audio_callback(hotword_audio_bytes,
self.hotwords.get_ww(ww))
self.listenword_audio_callback(hotword_audio_bytes, ww_data)

self.hotword_chunks.clear()

Expand All @@ -506,10 +508,12 @@ def _detect_ww(self, chunk: bytes) -> bool:
self.state = ListeningState.CHECK_WAKE_UP
LOG.debug(f"STATE: {self.state}")
else:
# Wake word detected, begin recording voice command
if not self.state == ListeningState.CONFIRMATION:
if ww_data.get("sound"):
self.state = ListeningState.CONFIRMATION
self.confirmation_seconds_left = self.confirmation_seconds
else:
self.state = ListeningState.BEFORE_COMMAND
LOG.debug(f"STATE: {self.state}")
# Wake word detected, begin recording voice command
self.reset_speech_timer()
self.stt_audio_bytes = bytes()
self.stt.stream_start()
Expand Down Expand Up @@ -557,6 +561,21 @@ def _wait_cmd(self, chunk: bytes):
self.stt_audio_bytes += chunk
self.stt_chunks.append(chunk)

def _confirmation_sound(self, chunk: bytes):
    """
    Handle an audio chunk while in the CONFIRMATION state, i.e. while the
    listen sound is playing.

    The current chunk is always flagged as listen-sound audio.

    With `instant_listen` set, the countdown is zeroed, the state moves to
    BEFORE_COMMAND at once and this same chunk is handed to `_before_cmd`.

    Otherwise the chunk is fed only to the audio transformers, and the
    confirmation countdown is reduced by one chunk's duration. When the
    countdown is used up the state moves to BEFORE_COMMAND.

    @param chunk: bytes of audio read from the microphone
    """
    self._chunk_info.is_listen_sound = True
    if self.instant_listen:
        # Do not wait for the sound to finish: treat this chunk as the
        # first "before command" chunk.
        self.confirmation_seconds_left = 0
        self.state = ListeningState.BEFORE_COMMAND
        self._before_cmd(chunk)
        return

    # instant_listen is NOT set: this method does not add the chunk to the
    # STT buffers, only the transformers see it.
    self.transformers.feed_audio(chunk)

    # Count down the confirmation window one chunk at a time.
    self.confirmation_seconds_left -= self.mic.seconds_per_chunk
    if self.confirmation_seconds_left <= 0:
        self.state = ListeningState.BEFORE_COMMAND

def _before_cmd(self, chunk: bytes):
"""
Handle audio chunks after WW detection or listen triggered, before VAD
Expand Down
3 changes: 0 additions & 3 deletions test/unittests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,6 @@ def test_handle_listen(self):
self.assertEqual(self.service.voice_loop.stt_audio_bytes, bytes())
self.service.voice_loop.stt.stream_start.assert_called_once()
self.service.voice_loop.stt.stream_start.reset_mock()
self.assertEqual(self.service.voice_loop.state, ListeningState.CONFIRMATION)
sleep(1)
self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), True)

self.service.voice_loop.state = ListeningState.DETECT_WAKEWORD
self.service.config["confirm_listening"] = False
Expand Down
Loading