OpenVoiceOS · JarbasAl · Jun 20, 2024 · Jun 19, 2024 · Jun 19, 2024 · Jun 19, 2024
diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py
@@ -18,7 +18,7 @@
 from hashlib import md5
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from threading import Thread, RLock, Event, Timer
+from threading import Thread, RLock, Event
 
 import speech_recognition as sr
 from distutils.spawn import find_executable
@@ -254,7 +254,7 @@ def _init_voice_loop(self, listener_config: dict):
                 fallback_stt=self.fallback_stt,
                 vad=self.vad,
                 transformers=self.transformers,
-                #
+                instant_listen=listener_config.get("instant_listen"),
                 speech_seconds=listener_config.get("speech_begin", 0.3),
                 silence_seconds=listener_config.get("silence_end", 0.7),
                 timeout_seconds=listener_config.get("recording_timeout", 10),
@@ -611,19 +611,10 @@ def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
             event = ww_context.get("event")
 
             if sound:
-                context = {'client_name': 'ovos_dinkum_listener',
-                           'source': 'listener',
-                           'destination': ["audio"]  # default native-source
-                           }
                 LOG.debug(f"Handling listen sound: {sound}")
                 self.bus.emit(Message("mycroft.audio.play_sound",
                                       {"uri": sound, "force_unmute": True},
                                       context))
-                if not listener.get("instant_listen"):
-                    self.voice_loop.state = ListeningState.CONFIRMATION
-                    self.voice_loop.confirmation_event.clear()
-                    Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
-
             if listen:
                 msg_type = "recognizer_loop:wakeword"
                 payload["utterance"] = \
@@ -788,12 +779,8 @@ def _handle_listen(self, message: Message):
                            }
                 message = message or Message("", context=context)  # might be None
                 self.bus.emit(message.forward("mycroft.audio.play_sound", {"uri": sound}))
-                if not self.config["listener"].get("instant_listen"):
-                    self.voice_loop.state = ListeningState.CONFIRMATION
-                    self.voice_loop.confirmation_event.clear()
-                    Timer(0.5, lambda: self.voice_loop.confirmation_event.set()).start()
-                else:
-                    self.voice_loop.state = ListeningState.BEFORE_COMMAND
+                self.voice_loop.state = ListeningState.CONFIRMATION
+                self.voice_loop.confirmation_seconds_left = self.voice_loop.confirmation_seconds
         else:
             self.voice_loop.state = ListeningState.BEFORE_COMMAND
 
@@ -880,7 +867,7 @@ def _handle_wake_up(self, message: Message):
     def _handle_sound_played(self, message: Message):
         """Handle response message from audio service."""
         if self.voice_loop.state == ListeningState.CONFIRMATION:
-            self.voice_loop.confirmation_event.set()
+            self.voice_loop.state = ListeningState.BEFORE_COMMAND
 
     def _handle_b64_audio(self, message: Message):
         """ transcribe base64 encoded audio """

diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py
@@ -92,6 +92,7 @@ def debiased_energy(audio_data: bytes, sample_width: int) -> float:
 @dataclass
 class ChunkInfo:
     is_speech: bool = False
+    is_listen_sound: bool = False
     energy: float = 0.0
 
 
@@ -106,17 +107,20 @@ class DinkumVoiceLoop(VoiceLoop):
     speech_seconds: float = 0.3
     silence_seconds: float = 0.7
     timeout_seconds: float = 10.0
-    timeout_seconds_with_silence: float = 5.0    
+    timeout_seconds_with_silence: float = 5.0
+    confirmation_seconds: float = 0.5  # TODO: can this be determined dynamically from the sound file?
     num_stt_rewind_chunks: int = 2
     num_hotword_keep_chunks: int = 15
     remove_silence: bool = False
+    instant_listen: bool = False
     skip_next_wake: bool = False
     hotword_chunks: Deque = field(default_factory=deque)
     stt_chunks: Deque = field(default_factory=deque)
     stt_audio_bytes: bytes = bytes()
     last_ww: float = -1.0
     speech_seconds_left: float = 0.0
     silence_seconds_left: float = 0.0
+    confirmation_seconds_left: float = 0.0
     timeout_seconds_left: float = 0.0
     timeout_seconds_with_silence_left: float = 0.0
     recording_seconds_with_silence_left: float = 0.0
@@ -181,7 +185,6 @@ def run(self):
         self.timeout_seconds_left = self.timeout_seconds
         self.timeout_seconds_with_silence_left = self.timeout_seconds_with_silence        
         self.state = ListeningState.DETECT_WAKEWORD
-        self.confirmation_event = Event()
 
         # Keep hotword/STT audio so they can (optionally) be saved to disk
         self.hotword_chunks = deque(maxlen=self.num_hotword_keep_chunks)
@@ -248,12 +251,10 @@ def run(self):
                 self._before_wakeup(chunk)
             elif self.state == ListeningState.CHECK_WAKE_UP:
                 self._detect_wakeup(chunk)
-
-            # set either by timeout (0.5) or by ovos-audio response
-            elif self.state == ListeningState.CONFIRMATION and \
-                    self.confirmation_event.is_set():
-                self.state = ListeningState.BEFORE_COMMAND
-                LOG.debug(f"STATE: {self.state}")
+
+            elif self.state == ListeningState.CONFIRMATION:
+                LOG.debug("playing listen sound")
+                self._confirmation_sound(chunk)
 
             elif self.state == ListeningState.BEFORE_COMMAND:
                 LOG.debug("waiting for speech")
@@ -485,14 +486,15 @@ def _detect_ww(self, chunk: bytes) -> bool:
         ww = self.hotwords.found()
         if ww:
             LOG.debug(f"Wake word detected={ww}")
+            ww_data = self.hotwords.get_ww(ww)
+
             # Callback to handle recorded hotword audio
             if self.listenword_audio_callback is not None:
                 hotword_audio_bytes = bytes()
                 while self.hotword_chunks:
                     hotword_audio_bytes += self.hotword_chunks.popleft()
 
-                self.listenword_audio_callback(hotword_audio_bytes,
-                                               self.hotwords.get_ww(ww))
+                self.listenword_audio_callback(hotword_audio_bytes, ww_data)
 
             self.hotword_chunks.clear()
 
@@ -506,10 +508,12 @@ def _detect_ww(self, chunk: bytes) -> bool:
                 self.state = ListeningState.CHECK_WAKE_UP
                 LOG.debug(f"STATE: {self.state}")
             else:
-                # Wake word detected, begin recording voice command
-                if not self.state == ListeningState.CONFIRMATION:
+                if ww_data.get("sound"):
+                    self.state = ListeningState.CONFIRMATION
+                    self.confirmation_seconds_left = self.confirmation_seconds
+                else:
                     self.state = ListeningState.BEFORE_COMMAND
-                    LOG.debug(f"STATE: {self.state}")
+                # Wake word detected, begin recording voice command
                 self.reset_speech_timer()
                 self.stt_audio_bytes = bytes()
                 self.stt.stream_start()
@@ -557,6 +561,21 @@ def _wait_cmd(self, chunk: bytes):
                 self.stt_audio_bytes += chunk
                 self.stt_chunks.append(chunk)
 
+    def _confirmation_sound(self, chunk: bytes):
+        self._chunk_info.is_listen_sound = True
+        if self.instant_listen:
+            self.confirmation_seconds_left = 0
+            self.state = ListeningState.BEFORE_COMMAND
+            self._before_cmd(chunk)
+            return
+
+        # instant_listen is NOT set: do not pass the chunk to _before_cmd,
+        # only feed the transformers while the confirmation window counts down
+        self.transformers.feed_audio(chunk)
+        self.confirmation_seconds_left -= self.mic.seconds_per_chunk
+        if self.confirmation_seconds_left <= 0:
+            self.state = ListeningState.BEFORE_COMMAND
+
     def _before_cmd(self, chunk: bytes):
         """
         Handle audio chunks after WW detection or listen triggered, before VAD

diff --git a/test/unittests/test_service.py b/test/unittests/test_service.py
@@ -294,9 +294,6 @@ def test_handle_listen(self):
         self.assertEqual(self.service.voice_loop.stt_audio_bytes, bytes())
         self.service.voice_loop.stt.stream_start.assert_called_once()
         self.service.voice_loop.stt.stream_start.reset_mock()
-        self.assertEqual(self.service.voice_loop.state, ListeningState.CONFIRMATION)
-        sleep(1)
-        self.assertEqual(self.service.voice_loop.confirmation_event.is_set(), True)
 
         self.service.voice_loop.state = ListeningState.DETECT_WAKEWORD
         self.service.config["confirm_listening"] = False