Addition of Pipeline Profiles (spokestack#22)

will-rice · web-flow · commit 89d48660a6e0 · 2020-10-13T10:09:57.000-04:00
* Addition of Pipeline Profiles

This change introduces two profiles to the Python library. Both profiles contain the speech pipeline, including ASR. The difference is in the activation trigger: WakewordSpokestackASR uses the TFLite wakeword model for activation, and VoiceActivityTriggerSpokestackASR uses the VAD to activate.
diff --git a/spokestack/activation_timeout.py b/spokestack/activation_timeout.py
@@ -13,7 +13,9 @@ class ActivationTimeout:
         max_active (int): the maximum length of an activation (ms)
     """
 
-    def __init__(self, frame_width=20, min_active=500, max_active=5000) -> None:
+    def __init__(
+        self, frame_width=20, min_active=500, max_active=5000, **kwargs
+    ) -> None:
 
         self._min_active = min_active / frame_width
         self._max_active = max_active / frame_width
diff --git a/spokestack/asr/speech_recognizer.py b/spokestack/asr/speech_recognizer.py
@@ -33,6 +33,7 @@ def __init__(
         sample_rate: int = 16000,
         frame_width: int = 20,
         idle_timeout: int = 5000,
+        **kwargs,
     ) -> None:
 
         self._client: CloudClient = CloudClient(
diff --git a/spokestack/io/pyaudio.py b/spokestack/io/pyaudio.py
@@ -15,7 +15,11 @@ class PyAudioInput:
     """
 
     def __init__(
-        self, sample_rate: int, frame_width: int, exception_on_overflow: bool = True
+        self,
+        sample_rate: int,
+        frame_width: int,
+        exception_on_overflow: bool = True,
+        **kwargs
     ) -> None:
         self._frame_size = int(sample_rate / 1000 * frame_width)
         self._exception_on_overflow = exception_on_overflow
diff --git a/spokestack/profile/__init__.py b/spokestack/profile/__init__.py
diff --git a/spokestack/profile/vad_trigger_asr.py b/spokestack/profile/vad_trigger_asr.py
@@ -0,0 +1,53 @@
+"""
+Pipeline profile with vad trigger and asr
+"""
+from spokestack.activation_timeout import ActivationTimeout
+from spokestack.asr.speech_recognizer import CloudSpeechRecognizer
+from spokestack.io.pyaudio import PyAudioInput
+from spokestack.pipeline import SpeechPipeline
+from spokestack.vad.webrtc import VoiceActivityDetector, VoiceActivityTrigger
+
+
+class VoiceActivityTriggerSpokestackASR:
+    """ VAD Trigger ASR """
+
+    @staticmethod
+    def create(
+        spokestack_id: str,
+        spokestack_secret: str,
+        sample_rate: int = 16000,
+        frame_width: int = 20,
+        **kwargs
+    ) -> SpeechPipeline:
+        """
+
+        Args:
+            spokestack_id (str): spokestack API id.
+            spokestack_secret (str): Spokestack API secret.
+            sample_rate (int): sample rate of the audio (Hz).
+            frame_width (int): width of the audio frame: 10, 20, or 30 (ms).
+
+        Returns:
+            SpeechPipeline instance with profile configuration.
+
+        """
+        pipeline = SpeechPipeline(
+            input_source=PyAudioInput(
+                sample_rate=sample_rate, frame_width=frame_width, **kwargs
+            ),
+            stages=[
+                VoiceActivityDetector(
+                    sample_rate=sample_rate, frame_width=frame_width, **kwargs
+                ),
+                VoiceActivityTrigger(),
+                ActivationTimeout(frame_width=frame_width, **kwargs),
+                CloudSpeechRecognizer(
+                    spokestack_id=spokestack_id,
+                    spokestack_secret=spokestack_secret,
+                    sample_rate=sample_rate,
+                    frame_width=frame_width,
+                    **kwargs
+                ),
+            ],
+        )
+        return pipeline
diff --git a/spokestack/profile/wakeword_asr.py b/spokestack/profile/wakeword_asr.py
@@ -0,0 +1,53 @@
+"""
+Pipeline profile for pyaudio input, vad, wakeword, and asr
+"""
+from spokestack.activation_timeout import ActivationTimeout
+from spokestack.asr.speech_recognizer import CloudSpeechRecognizer
+from spokestack.io.pyaudio import PyAudioInput
+from spokestack.pipeline import SpeechPipeline
+from spokestack.vad.webrtc import VoiceActivityDetector
+from spokestack.wakeword.tflite import WakewordTrigger
+
+
+class WakewordSpokestackASR:
+    """ TFLite wakeword with Spokestack speech recognition. """
+
+    @staticmethod
+    def create(
+        spokestack_id: str,
+        spokestack_secret: str,
+        sample_rate: int = 16000,
+        frame_width: int = 20,
+        model_dir: str = "",
+        **kwargs,
+    ) -> SpeechPipeline:
+        """ Creates a speech pipeline instance from profile
+
+        Args:
+            spokestack_id (str): spokestack API id.
+            spokestack_secret (str): Spokestack API secret.
+            sample_rate (int): sample rate of the audio (Hz).
+            frame_width (int): width of the audio frame: 10, 20, or 30 (ms).
+            model_dir (str): Directory containing the tflite wakeword models.
+
+        Returns:
+
+        """
+        pipeline = SpeechPipeline(
+            input_source=PyAudioInput(
+                frame_width=frame_width, sample_rate=sample_rate, **kwargs
+            ),
+            stages=[
+                VoiceActivityDetector(
+                    frame_width=frame_width, sample_rate=sample_rate, **kwargs,
+                ),
+                WakewordTrigger(model_dir=model_dir, **kwargs),
+                ActivationTimeout(frame_width=frame_width, **kwargs),
+                CloudSpeechRecognizer(
+                    spokestack_secret=spokestack_secret,
+                    spokestack_id=spokestack_id,
+                    **kwargs,
+                ),
+            ],
+        )
+        return pipeline
diff --git a/spokestack/vad/webrtc.py b/spokestack/vad/webrtc.py
@@ -37,6 +37,7 @@ def __init__(
         vad_rise_delay: int = 0,
         vad_fall_delay: int = 0,
         mode: int = QUALITY,
+        **kwargs
     ) -> None:
 
         self._sample_rate: int = sample_rate
diff --git a/spokestack/wakeword/tflite.py b/spokestack/wakeword/tflite.py
@@ -36,6 +36,7 @@ def __init__(
         fft_hop_length: int = 10,
         model_dir: str = "",
         posterior_threshold: float = 0.5,
+        **kwargs,
     ) -> None:
 
         self.pre_emphasis: float = pre_emphasis
diff --git a/tests/profile/test_vad_trigger_asr.py b/tests/profile/test_vad_trigger_asr.py
@@ -0,0 +1,14 @@
+"""
+This module contains the tests for vad_trigger_asr profile
+"""
+from unittest import mock
+
+from spokestack.profile.vad_trigger_asr import VoiceActivityTriggerSpokestackASR
+
+
+@mock.patch("spokestack.profile.vad_trigger_asr.PyAudioInput")
+@mock.patch("spokestack.profile.vad_trigger_asr.SpeechPipeline")
+def test_activate(*args):
+    pipeline = VoiceActivityTriggerSpokestackASR.create("", "")
+    pipeline.start()
+    pipeline.run()
diff --git a/tests/profile/test_wakeword_asr.py b/tests/profile/test_wakeword_asr.py
@@ -0,0 +1,15 @@
+"""
+This module contains the tests for the wakeword asr profile.
+"""
+from unittest import mock
+
+from spokestack.profile.wakeword_asr import WakewordSpokestackASR
+
+
+@mock.patch("spokestack.profile.wakeword_asr.PyAudioInput")
+@mock.patch("spokestack.profile.wakeword_asr.WakewordTrigger")
+@mock.patch("spokestack.profile.wakeword_asr.SpeechPipeline")
+def test_activate(*args):
+    pipeline = WakewordSpokestackASR.create("", "")
+    pipeline.start()
+    pipeline.run()
diff --git a/tests/wakeword/test_tflite.py b/tests/wakeword/test_tflite.py
@@ -72,6 +72,7 @@ def test_detect_inactive_vad_deactivate(_mock):
         detector(context, test_frame)
         context.is_speech = False
         assert not context.is_active
+    detector(context, test_frame)
 
 
 @mock.patch("spokestack.wakeword.tflite.TFLiteModel", new_callable=ModelFactory)