From b9409ca67d2a15b36717613052d9365655fcf2eb Mon Sep 17 00:00:00 2001
From: Markus Hennerbichler
Date: Wed, 31 Jan 2024 12:07:39 +0000
Subject: [PATCH] Add support for Audio Events

---
 CHANGELOG.md               |  6 ++++++
 VERSION                    |  2 +-
 speechmatics/cli.py        | 23 +++++++++++++++++++++--
 speechmatics/cli_parser.py |  6 ++++++
 speechmatics/client.py     | 24 ++++++++++++++++++------
 speechmatics/models.py     | 28 +++++++++++++++++++++++++---
 6 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2076944..54aeeb7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.14.0] - 2024-02-12
+
+### Added
+
+- Support for the Audio Events feature
+
 ## [1.13.1] - 2023-12-21
 
 ### Changed
diff --git a/VERSION b/VERSION
index b50dd27..850e742 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.13.1
+1.14.0
diff --git a/speechmatics/cli.py b/speechmatics/cli.py
index 03df295..f733c56 100755
--- a/speechmatics/cli.py
+++ b/speechmatics/cli.py
@@ -11,7 +11,7 @@
 import sys
 from dataclasses import dataclass
 from socket import gaierror
-from typing import List
+from typing import Any, Dict, List
 
 import httpx
 import toml
@@ -26,6 +26,7 @@
 from speechmatics.exceptions import JobNotFoundException, TranscriptionError
 from speechmatics.helpers import _process_status_errors
 from speechmatics.models import (
+    AudioEventsConfig,
     AudioSettings,
     AutoChaptersConfig,
     BatchLanguageIdentificationConfig,
@@ -198,7 +199,7 @@ def get_transcription_config(
             config = json.load(config_file)
     else:
         # Ensure "en" is the default language as to not break existing API behavior.
-        config = {"language": "en"}
+        config: Dict[str, Any] = {"language": "en"}
 
     # transcription_config is flattened in the BatchTranscriptionConfig,
     # so the config entry from JSON must be flattened here, otherwise the JSON entry would be ignored
@@ -341,6 +342,14 @@
     if args_auto_chapters or auto_chapters_config is not None:
         config["auto_chapters_config"] = AutoChaptersConfig()
 
+    audio_events_config = config.get("audio_events_config", None)
+    arg_audio_events = args.get("audio_events")
+    if audio_events_config or arg_audio_events is not None:
+        types = None
+        if audio_events_config and audio_events_config.get("types"):
+            types = audio_events_config.get("types")
+        config["audio_events_config"] = AudioEventsConfig(types)
+
     if args["mode"] == "rt":
         # pylint: disable=unexpected-keyword-arg
         return TranscriptionConfig(**config)
@@ -448,6 +457,14 @@ def transcript_handler(message):
         sys.stdout.write(f"{escape_seq}{plaintext}\n")
         transcripts.text += plaintext
 
+    def audio_event_handler(message):
+        if print_json:
+            print(json.dumps(message))
+            return
+        event_name = message["event"].get("type", "").upper()
+        sys.stdout.write(f"{escape_seq}[{event_name}]\n")
+        transcripts.text += f"[{event_name}] "
+
     def partial_translation_handler(message):
         if print_json:
             print(json.dumps(message))
@@ -480,6 +497,8 @@ def end_of_transcript_handler(_):
     # print both transcription and translation messages (if json was requested)
     # print translation (if text was requested then)
     # print transcription (if text was requested without translation)
+
+    api.add_event_handler(ServerMessageType.AudioEventStarted, audio_event_handler)
     if print_json:
         if enable_partials or enable_translation_partials:
             api.add_event_handler(
diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py
index 8170004..97aec2f 100644
--- a/speechmatics/cli_parser.py
+++ b/speechmatics/cli_parser.py
@@ -467,6 +467,12 @@ def get_arg_parser():
         help="Which type of diarization to use.",
     )
 
+    rt_transcribe_command_parser.add_argument(
+        "--audio-events",
+        action="store_true",
+        help="Enable audio event detection and print events in square-brackets to the console, e.g. [MUSIC]",
+    )
+
     # Build our actual parsers.
     mode_subparsers = parser.add_subparsers(title="Mode", dest="mode")
 
diff --git a/speechmatics/client.py b/speechmatics/client.py
index c9869e5..43e3ebe 100644
--- a/speechmatics/client.py
+++ b/speechmatics/client.py
@@ -72,7 +72,6 @@ def __init__(
         self.connection_settings.set_missing_values_from_config(UsageMode.RealTime)
         self.websocket = None
         self.transcription_config = None
-        self.translation_config = None
 
         self.event_handlers = {x: [] for x in ServerMessageType}
         self.middlewares = {x: [] for x in ClientMessageType}
@@ -135,12 +134,19 @@ def _set_recognition_config(self):
         :py:attr:`speechmatics.models.ClientMessageType.SetRecognitionConfig`
         message.
""" + assert self.transcription_config is not None msg = { "message": ClientMessageType.SetRecognitionConfig, "transcription_config": self.transcription_config.as_config(), } - if self.translation_config is not None: - msg["translation_config"] = self.translation_config.asdict() + if self.transcription_config.translation_config is not None: + msg[ + "translation_config" + ] = self.transcription_config.translation_config.asdict() + if self.transcription_config.audio_events_config is not None: + msg[ + "audio_events_config" + ] = self.transcription_config.audio_events_config.asdict() self._call_middleware(ClientMessageType.SetRecognitionConfig, msg, False) return msg @@ -155,13 +161,20 @@ def _start_recognition(self, audio_settings): :param audio_settings: Audio settings to use. :type audio_settings: speechmatics.models.AudioSettings """ + assert self.transcription_config is not None msg = { "message": ClientMessageType.StartRecognition, "audio_format": audio_settings.asdict(), "transcription_config": self.transcription_config.as_config(), } - if self.translation_config is not None: - msg["translation_config"] = self.translation_config.asdict() + if self.transcription_config.translation_config is not None: + msg[ + "translation_config" + ] = self.transcription_config.translation_config.asdict() + if self.transcription_config.audio_events_config is not None: + msg[ + "audio_events_config" + ] = self.transcription_config.audio_events_config.asdict() self.session_running = True self._call_middleware(ClientMessageType.StartRecognition, msg, False) LOGGER.debug(msg) @@ -435,7 +448,6 @@ async def run( consumer/producer tasks. """ self.transcription_config = transcription_config - self.translation_config = transcription_config.translation_config self.seq_no = 0 self._language_pack_info = None await self._init_synchronization_primitives() diff --git a/speechmatics/models.py b/speechmatics/models.py index 3461201..13079ab 100644 --- a/speechmatics/models.py +++ b/speechmatics/models.py @@ -176,7 +176,7 @@ class BatchTranslationConfig(TranslationConfig): class BatchLanguageIdentificationConfig: """Batch mode: Language identification config.""" - expected_languages: List[str] = None + expected_languages: Optional[List[str]] = None """Expected languages for language identification""" @@ -203,7 +203,7 @@ class SentimentAnalysisConfig: class TopicDetectionConfig: """Defines topic detection parameters.""" - topics: List[str] = None + topics: Optional[List[str]] = None """Optional list of topics for topic detection.""" @@ -212,6 +212,18 @@ class AutoChaptersConfig: """Auto Chapters config.""" +@dataclass +class AudioEventsConfig: + + types: Optional[List[str]] + """Optional list of audio event types to detect.""" + + def asdict(self): + if self.types is None: + self.types = [] + return asdict(self) + + @dataclass(init=False) class TranscriptionConfig(_TranscriptionConfig): # pylint: disable=too-many-instance-attributes @@ -254,12 +266,16 @@ class TranscriptionConfig(_TranscriptionConfig): """Indicates if partial translation, where words are produced immediately, is enabled.""" - translation_config: TranslationConfig = None + translation_config: Optional[TranslationConfig] = None """Optional configuration for translation.""" + audio_events_config: Optional[AudioEventsConfig] = None + """Optional configuration for audio events""" + def as_config(self): dictionary = self.asdict() dictionary.pop("translation_config", None) + dictionary.pop("audio_events_config", None) 
dictionary.pop("enable_translation_partials", None) enable_transcription_partials = dictionary.pop( "enable_transcription_partials", False @@ -504,6 +520,12 @@ class ServerMessageType(str, Enum): AddTranscript = "AddTranscript" """Indicates the final transcript of a part of the audio.""" + AudioEventStarted = "AudioEventStarted" + """Indicates the start of an audio event.""" + + AudioEventEnded = "AudioEventEnded" + """Indicates the end of an audio event.""" + AddPartialTranslation = "AddPartialTranslation" """Indicates a partial translation, which is an incomplete translation that is immediately produced and may change as more context becomes available.