diff options
author | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500 |
---|---|---|
committer | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500 |
commit | 0c3c5a02a75bc61b6bf6e303de20e11741d2afac (patch) | |
tree | 30ae1d524ffe5d54172b7a4a8445d90c3461e659 /libs/auditok | |
parent | 36bf0d219d0432c20e6314e0ce752b36f4d88e3c (diff) | |
download | bazarr-0c3c5a02a75bc61b6bf6e303de20e11741d2afac.tar.gz bazarr-0c3c5a02a75bc61b6bf6e303de20e11741d2afac.zip |
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. v1.0.3-beta.16
Diffstat (limited to 'libs/auditok')
-rw-r--r-- | libs/auditok/__init__.py | 10 | ||||
-rwxr-xr-x | libs/auditok/cmdline.py | 1155 | ||||
-rwxr-xr-x | libs/auditok/cmdline_util.py | 126 | ||||
-rw-r--r-- | libs/auditok/core.py | 1656 | ||||
-rw-r--r-- | libs/auditok/dataset.py | 24 | ||||
-rw-r--r-- | libs/auditok/exceptions.py | 42 | ||||
-rw-r--r-- | libs/auditok/io.py | 1264 | ||||
-rwxr-xr-x | libs/auditok/plotting.py | 150 | ||||
-rw-r--r-- | libs/auditok/signal.py | 179 | ||||
-rw-r--r-- | libs/auditok/signal_numpy.py | 30 | ||||
-rw-r--r-- | libs/auditok/util.py | 1734 | ||||
-rwxr-xr-x | libs/auditok/workers.py | 427 |
12 files changed, 4572 insertions, 2225 deletions
diff --git a/libs/auditok/__init__.py b/libs/auditok/__init__.py index 4ea697b77..edd336cc3 100644 --- a/libs/auditok/__init__.py +++ b/libs/auditok/__init__.py @@ -2,20 +2,16 @@ :author: Amine SEHILI <[email protected]> -2015-2016 +2015-2021 :License: -This package is published under GNU GPL Version 3. +This package is published under the MIT license. """ -from __future__ import absolute_import from .core import * from .io import * from .util import * -from . import dataset from .exceptions import * -__version__ = "0.1.5" - - +__version__ = "0.2.0" diff --git a/libs/auditok/cmdline.py b/libs/auditok/cmdline.py index b6a51d11b..7e7450762 100755 --- a/libs/auditok/cmdline.py +++ b/libs/auditok/cmdline.py @@ -1,789 +1,428 @@ #!/usr/bin/env python # encoding: utf-8 -''' -auditok.auditok -- Audio Activity Detection tool - -auditok.auditok is a program that can be used for Audio/Acoustic activity detection. -It can read audio data from audio files as well as from built-in device(s) or standard input +""" +`auditok` -- An Audio Activity Detection tool +`auditok` is a program that can be used for Audio/Acoustic +activity detection. It can read audio data from audio files as well +as from the microphone or standard input. 
@author: Mohamed El Amine SEHILI - -@copyright: 2015 Mohamed El Amine SEHILI - -@license: GPL v3 - +@copyright: 2015-2021 Mohamed El Amine SEHILI +@license: MIT @contact: [email protected] -@deffield updated: 02 Dec 2015 -''' +@deffield updated: 01 Mar 2021 +""" import sys import os - -from optparse import OptionParser, OptionGroup -from threading import Thread -import tempfile -import wave +from argparse import ArgumentParser import time import threading -import logging -try: - import future - from queue import Queue, Empty -except ImportError: - if sys.version_info >= (3, 0): - from queue import Queue, Empty - else: - from Queue import Queue, Empty +from auditok import __version__, AudioRegion +from .util import AudioDataSource +from .exceptions import EndOfProcessing, AudioEncodingWarning +from .io import player_for +from .cmdline_util import make_logger, make_kwargs, initialize_workers +from . import workers -try: - from pydub import AudioSegment - WITH_PYDUB = True -except ImportError: - WITH_PYDUB = False - - -from .core import StreamTokenizer -from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for -from .util import ADSFactory, AudioEnergyValidator -from auditok import __version__ as version __all__ = [] -__version__ = version -__date__ = '2015-11-23' -__updated__ = '2015-03-11' - -DEBUG = 0 -TESTRUN = 1 -PROFILE = 0 - -LOGGER_NAME = "AUDITOK_LOGGER" - -class AudioFileFormatError(Exception): - pass - -class TimeFormatError(Exception): - pass - -def file_to_audio_source(filename, filetype=None, **kwargs): - - lower_fname = filename.lower() - rawdata = False - - if filetype is not None: - filetype = filetype.lower() - - if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): - - srate = kwargs.pop("sampling_rate", None) - if srate is None: - srate = kwargs.pop("sr", None) - - swidth = kwargs.pop("sample_width", None) - if swidth is None: - swidth = kwargs.pop("sw", None) - - ch = kwargs.pop("channels", None) - if ch 
is None: - ch = kwargs.pop("ch", None) - - if None in (swidth, srate, ch): - raise Exception("All audio parameters are required for raw data") - - data = open(filename).read() - rawdata = True - - # try first with pydub - if WITH_PYDUB: - - use_channel = kwargs.pop("use_channel", None) - if use_channel is None: - use_channel = kwargs.pop("uc", None) - - if use_channel is None: - use_channel = 1 - else: - try: - use_channel = int(use_channel) - except ValueError: - pass - - if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] : - raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'") - - asegment = None - - if rawdata: - asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) - if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")): - asegment = AudioSegment.from_wav(filename) - elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")): - asegment = AudioSegment.from_mp3(filename) - elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")): - asegment = AudioSegment.from_ogg(filename) - elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")): - asegment = AudioSegment.from_flv(filename) - else: - asegment = AudioSegment.from_file(filename) - - if asegment.channels > 1: - - if isinstance(use_channel, int): - if use_channel > asegment.channels: - raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels)) - else: - asegment = asegment.split_to_mono()[use_channel - 1] - else: - ch_lower = use_channel.lower() - - if ch_lower == "mix": - asegment = asegment.set_channels(1) - - elif use_channel.lower() == "left": - asegment = asegment.split_to_mono()[0] - - elif use_channel.lower() == "right": - asegment = asegment.split_to_mono()[1] - - return BufferAudioSource(data_buffer = asegment._data, - sampling_rate = asegment.frame_rate, - 
sample_width = asegment.sample_width, - channels = asegment.channels) - # fall back to standard python - else: - if rawdata: - if ch != 1: - raise ValueError("Cannot handle multi-channel audio without pydub") - return BufferAudioSource(data, srate, swidth, ch) - - if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): - - wfp = wave.open(filename) - - ch = wfp.getnchannels() - if ch != 1: - wfp.close() - raise ValueError("Cannot handle multi-channel audio without pydub") - - srate = wfp.getframerate() - swidth = wfp.getsampwidth() - data = wfp.readframes(wfp.getnframes()) - wfp.close() - return BufferAudioSource(data, srate, swidth, ch) - - raise AudioFileFormatError("Cannot read audio file format") - - -def save_audio_data(data, filename, filetype=None, **kwargs): - - lower_fname = filename.lower() - if filetype is not None: - filetype = filetype.lower() - - # save raw data - if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): - fp = open(filename, "w") - fp.write(data) - fp.close() - return - - # save other types of data - # requires all audio parameters - srate = kwargs.pop("sampling_rate", None) - if srate is None: - srate = kwargs.pop("sr", None) - - swidth = kwargs.pop("sample_width", None) - if swidth is None: - swidth = kwargs.pop("sw", None) - - ch = kwargs.pop("channels", None) - if ch is None: - ch = kwargs.pop("ch", None) - - if None in (swidth, srate, ch): - raise Exception("All audio parameters are required to save no raw data") - - if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): - # use standard python's wave module - fp = wave.open(filename, "w") - fp.setnchannels(ch) - fp.setsampwidth(swidth) - fp.setframerate(srate) - fp.writeframes(data) - fp.close() - - elif WITH_PYDUB: - - asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) - asegment.export(filename, format=filetype) - - else: - raise AudioFileFormatError("cannot write 
file format {0} (file name: {1})".format(filetype, filename)) - - -def plot_all(signal, sampling_rate, energy_as_amp, detections=[], show=True, save_as=None): - - import matplotlib.pyplot as plt - import numpy as np - t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate ) - if len(t) > len(signal): - t = t[: len(signal) - len(t)] - - for start, end in detections: - p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4) - - line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude") - plt.plot(t, signal) - legend = plt.legend(["Detection threshold"], bbox_to_anchor=(0., 1.02, 1., .102), loc=1, fontsize=16) - ax = plt.gca().add_artist(legend) - - plt.xlabel("Time (s)", fontsize=24) - plt.ylabel("Amplitude (normalized)", fontsize=24) - - if save_as is not None: - plt.savefig(save_as, dpi=120) - - if show: - plt.show() - - -def seconds_to_str_fromatter(_format): - """ - Accepted format directives: %i %s %m %h - """ - # check directives are correct - - if _format == "%S": - def _fromatter(seconds): - return "{:.2f}".format(seconds) - - elif _format == "%I": - def _fromatter(seconds): - return "{0}".format(int(seconds * 1000)) - - else: - _format = _format.replace("%h", "{hrs:02d}") - _format = _format.replace("%m", "{mins:02d}") - _format = _format.replace("%s", "{secs:02d}") - _format = _format.replace("%i", "{millis:03d}") - - try: - i = _format.index("%") - raise TimeFormatError("Unknow time format directive '{0}'".format(_format[i:i+2])) - except ValueError: - pass - - def _fromatter(seconds): - millis = int(seconds * 1000) - hrs, millis = divmod(millis, 3600000) - mins, millis = divmod(millis, 60000) - secs, millis = divmod(millis, 1000) - return _format.format(hrs=hrs, mins=mins, secs=secs, millis=millis) - - return _fromatter - - - -class Worker(Thread): - - def __init__(self, timeout=0.2, debug=False, logger=None): - self.timeout = timeout - self.debug = debug - 
self.logger = logger - - if self.debug and self.logger is None: - self.logger = logging.getLogger(LOGGER_NAME) - self.logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler(sys.stdout) - self.logger.addHandler(handler) - - self._inbox = Queue() - self._stop_request = Queue() - Thread.__init__(self) - - - def debug_message(self, message): - self.logger.debug(message) - - def _stop_requested(self): - - try: - message = self._stop_request.get_nowait() - if message == "stop": - return True - - except Empty: - return False - - def stop(self): - self._stop_request.put("stop") - self.join() - - def send(self, message): - self._inbox.put(message) - - def _get_message(self): - try: - message = self._inbox.get(timeout=self.timeout) - return message - except Empty: - return None - - -class TokenizerWorker(Worker): - - END_OF_PROCESSING = "END_OF_PROCESSING" - - def __init__(self, ads, tokenizer, analysis_window, observers): - self.ads = ads - self.tokenizer = tokenizer - self.analysis_window = analysis_window - self.observers = observers - self._inbox = Queue() - self.count = 0 - Worker.__init__(self) - - def run(self): - - def notify_observers(data, start, end): - audio_data = b''.join(data) - self.count += 1 - - start_time = start * self.analysis_window - end_time = (end+1) * self.analysis_window - duration = (end - start + 1) * self.analysis_window - - # notify observers - for observer in self.observers: - observer.notify({"id" : self.count, - "audio_data" : audio_data, - "start" : start, - "end" : end, - "start_time" : start_time, - "end_time" : end_time, - "duration" : duration} - ) - - self.ads.open() - self.tokenizer.tokenize(data_source=self, callback=notify_observers) - for observer in self.observers: - observer.notify(TokenizerWorker.END_OF_PROCESSING) - - def add_observer(self, observer): - self.observers.append(observer) - - def remove_observer(self, observer): - self.observers.remove(observer) - - def read(self): - if self._stop_requested(): - return 
None - else: - return self.ads.read() - - -class PlayerWorker(Worker): - - def __init__(self, player, timeout=0.2, debug=False, logger=None): - self.player = player - Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) - - def run(self): - while True: - if self._stop_requested(): - break - - message = self._get_message() - if message is not None: - if message == TokenizerWorker.END_OF_PROCESSING: - break - - audio_data = message.pop("audio_data", None) - start_time = message.pop("start_time", None) - end_time = message.pop("end_time", None) - dur = message.pop("duration", None) - _id = message.pop("id", None) - - if audio_data is not None: - if self.debug: - self.debug_message("[PLAY]: Detection {id} played (start:{start}, end:{end}, dur:{dur})".format(id=_id, - start="{:5.2f}".format(start_time), end="{:5.2f}".format(end_time), dur="{:5.2f}".format(dur))) - self.player.play(audio_data) - - def notify(self, message): - self.send(message) - - -class CommandLineWorker(Worker): - - def __init__(self, command, timeout=0.2, debug=False, logger=None): - self.command = command - Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) - - def run(self): - while True: - if self._stop_requested(): - break - - message = self._get_message() - if message is not None: - if message == TokenizerWorker.END_OF_PROCESSING: - break - - audio_data = message.pop("audio_data", None) - _id = message.pop("id", None) - if audio_data is not None: - raw_audio_file = tempfile.NamedTemporaryFile(delete=False) - raw_audio_file.write(audio_data) - cmd = self.command.replace("$", raw_audio_file.name) - if self.debug: - self.debug_message("[CMD ]: Detection {id} command: {cmd}".format(id=_id, cmd=cmd)) - os.system(cmd) - os.unlink(raw_audio_file.name) - - def notify(self, message): - self.send(message) - - -class TokenSaverWorker(Worker): - - def __init__(self, name_format, filetype, timeout=0.2, debug=False, logger=None, **kwargs): - self.name_format = name_format - 
self.filetype = filetype - self.kwargs = kwargs - Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) - - def run(self): - while True: - if self._stop_requested(): - break - - message = self._get_message() - if message is not None: - if message == TokenizerWorker.END_OF_PROCESSING: - break - - audio_data = message.pop("audio_data", None) - start_time = message.pop("start_time", None) - end_time = message.pop("end_time", None) - _id = message.pop("id", None) - if audio_data is not None and len(audio_data) > 0: - fname = self.name_format.format(N=_id, start = "{:.2f}".format(start_time), end = "{:.2f}".format(end_time)) - try: - if self.debug: - self.debug_message("[SAVE]: Detection {id} saved as {fname}".format(id=_id, fname=fname)) - save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs) - except Exception as e: - sys.stderr.write(str(e) + "\n") - - def notify(self, message): - self.send(message) - - -class LogWorker(Worker): - - def __init__(self, print_detections=False, output_format="{start} {end}", - time_formatter=seconds_to_str_fromatter("%S"), timeout=0.2, debug=False, logger=None): - - self.print_detections = print_detections - self.output_format = output_format - self.time_formatter = time_formatter - self.detections = [] - Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) - - def run(self): - while True: - if self._stop_requested(): - break - - message = self._get_message() - - if message is not None: - - if message == TokenizerWorker.END_OF_PROCESSING: - break - - audio_data = message.pop("audio_data", None) - _id = message.pop("id", None) - start = message.pop("start", None) - end = message.pop("end", None) - start_time = message.pop("start_time", None) - end_time = message.pop("end_time", None) - if audio_data is not None and len(audio_data) > 0: - - if self.debug: - self.debug_message("[DET ]: Detection {id} (start:{start}, end:{end})".format(id=_id, - start="{:5.2f}".format(start_time), - 
end="{:5.2f}".format(end_time))) - - if self.print_detections: - print(self.output_format.format(id = _id, - start = self.time_formatter(start_time), - end = self.time_formatter(end_time))) - - self.detections.append((_id, start, end, start_time, end_time)) - - - def notify(self, message): - self.send(message) - +__date__ = "2015-11-23" +__updated__ = "2021-03-01" def main(argv=None): - '''Command line options.''' - program_name = os.path.basename(sys.argv[0]) - program_version = version - program_build_date = "%s" % __updated__ - - program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) - #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse - program_longdesc = '''''' # optional - give further explanation about what the program does - program_license = "Copyright 2015 Mohamed El Amine SEHILI \ - Licensed under the General Public License (GPL) Version 3 \nhttp://www.gnu.org/licenses/" - if argv is None: argv = sys.argv[1:] try: - # setup option parser - parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) - - group = OptionGroup(parser, "[Input-Output options]") - group.add_option("-i", "--input", dest="input", help="Input audio or video file. Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE") - group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String") - group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT") - group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. 
If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE") - group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING") - group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING") - group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 1st or left channel)]", type=str, default="1", metavar="STRING") - parser.add_option_group(group) - - - group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.") - group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT") - group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT") - group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT") - group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT") - group.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: keep trailing silence]", action="store_true", default=False) - group.add_option("-e", "--energy-threshold", 
dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=50, metavar="FLOAT") - parser.add_option_group(group) - - - group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.") - group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT") - group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT") - group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, metavar="INT") - parser.add_option_group(group) - - group = OptionGroup(parser, "[Do something with detections]", "Use these options to print, play or plot detections.") - group.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command (e.g. -C 'du -h $')", default=None, type=str, metavar="STRING") - group.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False) - group.add_option("-p", "--plot", dest="plot", help="Plot and show audio signal and detections (requires matplotlib)", action="store_true", default=False) - group.add_option("", "--save-image", dest="save_image", help="Save plotted audio signal and detections as a picture or a PDF file (requires matplotlib)", type=str, default=None, metavar="FILE") - group.add_option("", "--printf", dest="printf", help="print detections one per line using a user supplied format (e.g. '[{id}]: {start} -- {end}'). 
Available keywords {id}, {start} and {end}", type=str, default="{id} {start} {end}", metavar="STRING") - group.add_option("", "--time-format", dest="time_format", help="format used to print {start} and {end}. [Default= %default]. %S: absolute time in sec. %I: absolute time in ms. If at least one of (%h, %m, %s, %i) is used, convert time into hours, minutes, seconds and millis (e.g. %h:%m:%s.%i). Only required fields are printed", type=str, default="%S", metavar="STRING") - parser.add_option_group(group) - - parser.add_option("-q", "--quiet", dest="quiet", help="Do not print any information about detections [default: print 'id', 'start' and 'end' of each detection]", action="store_true", default=False) - parser.add_option("-D", "--debug", dest="debug", help="Print processing operations to STDOUT", action="store_true", default=False) - parser.add_option("", "--debug-file", dest="debug_file", help="Print processing operations to FILE", type=str, default=None, metavar="FILE") - - + parser = ArgumentParser( + prog=program_name, description="An Audio Tokenization tool" + ) + parser.add_argument( + "--version", "-v", action="version", version=__version__ + ) + group = parser.add_argument_group("Input-Output options") + group.add_argument( + dest="input", + help="Input audio or video file. Use '-' for stdin " + "[default: read from microphone using pyaudio]", + metavar="input", + nargs="?", + default=None, + ) + group.add_argument( + "-I", + "--input-device-index", + dest="input_device_index", + help="Audio device index [default: %(default)s]. " + "Optional and only effective when using PyAudio", + type=int, + default=None, + metavar="INT", + ) + group.add_argument( + "-F", + "--audio-frame-per-buffer", + dest="frame_per_buffer", + help="Audio frame per buffer [default: %(default)s]. 
" + "Optional and only effective when using PyAudio", + type=int, + default=1024, + metavar="INT", + ) + group.add_argument( + "-f", + "--input-format", + dest="input_format", + type=str, + default=None, + help="Input audio file format. If not given, guess format from " + "extension. If output file name has no extension, guess format " + "from file header (requires pydub). If none of the previous is " + "true, raise an error", + metavar="STRING", + ) + group.add_argument( + "-M", + "--max-read", + dest="max_read", + type=float, + default=None, + help="Maximum data (in seconds) to read from microphone or file " + "[default: read until the end of file/stream]", + metavar="FLOAT", + ) + group.add_argument( + "-L", + "--large-file", + dest="large_file", + action="store_true", + default=False, + help="Whether input file should be treated as a large file. " + "If True, data will be read from file on demand, otherwise all " + "audio data is loaded to memory before tokenization.", + ) + group.add_argument( + "-O", + "--save-stream", + dest="save_stream", + type=str, + default=None, + help="Save acquired audio data (from file or microphone) to disk." + " If omitted no data will be saved. [default: omitted]", + metavar="FILE", + ) + group.add_argument( + "-o", + "--save-detections-as", + dest="save_detections_as", + type=str, + default=None, + help="File name format for detections." + "The following placeholders can be used to build output file name " + "for each detection: {id} (sequential, starts from 1), {start}, " + "{end} and {duration}. Time placeholders are in seconds. " + "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'", + metavar="STRING", + ) + group.add_argument( + "-T", + "--output-format", + dest="output_format", + type=str, + default=None, + help="Audio format used to save detections and/or main stream. " + "If not supplied, then it will: (1. be guessed from extension or " + "(2. 
use raw format", + metavar="STRING", + ) + group.add_argument( + "-u", + "--use-channel", + dest="use_channel", + type=str, + default=None, + help="Which channel to use for tokenization when input stream is " + "multi-channel (0 is the first channel). Default is None, meaning " + "that all channels will be considered for tokenization (i.e., get " + "any valid audio event regardless of the channel it occurs in). " + "This value can also be 'mix' (alias 'avg' or 'average') and " + "means mix down all audio channels into one channel (i.e. compute " + "average channel) and use the resulting channel for tokenization. " + "Whatever option is used, saved audio events will contain the same" + " number of channels as input stream. " + "[Default: None, use all channels]", + metavar="INT/STRING", + ) + + group = parser.add_argument_group( + "Tokenization options", "Set tokenizer options." + ) + group.add_argument( + "-a", + "--analysis-window", + dest="analysis_window", + default=0.01, + type=float, + help="Size of analysis window in seconds [default: %(default)s " + "(10ms)]", + metavar="FLOAT", + ) + group.add_argument( + "-n", + "--min-duration", + dest="min_duration", + type=float, + default=0.2, + help="Min duration of a valid audio event in seconds " + "[default: %(default)s]", + metavar="FLOAT", + ) + group.add_argument( + "-m", + "--max-duration", + dest="max_duration", + type=float, + default=5, + help="Max duration of a valid audio event in seconds " + "[default: %(default)s]", + metavar="FLOAT", + ) + group.add_argument( + "-s", + "--max-silence", + dest="max_silence", + type=float, + default=0.3, + help="Max duration of a consecutive silence within a valid audio " + "event in seconds [default: %(default)s]", + metavar="FLOAT", + ) + group.add_argument( + "-d", + "--drop-trailing-silence", + dest="drop_trailing_silence", + action="store_true", + default=False, + help="Drop trailing silence from a detection [default: keep " + "trailing silence]", + ) + 
group.add_argument( + "-R", + "--strict-min-duration", + dest="strict_min_duration", + action="store_true", + default=False, + help="Reject an event shorter than --min-duration even if it's " + "adjacent to the latest valid event that reached max-duration " + "[default: keep such events]", + ) + group.add_argument( + "-e", + "--energy-threshold", + dest="energy_threshold", + type=float, + default=50, + help="Log energy threshold for detection [default: %(default)s]", + metavar="FLOAT", + ) + + group = parser.add_argument_group( + "Audio parameters", + "Define audio parameters if data is read from a " + "headerless file (raw or stdin) or you want to use " + "different microphone parameters.", + ) + group.add_argument( + "-r", + "--rate", + dest="sampling_rate", + type=int, + default=16000, + help="Sampling rate of audio data [default: %(default)s]", + metavar="INT", + ) + group.add_argument( + "-c", + "--channels", + dest="channels", + type=int, + default=1, + help="Number of channels of audio data [default: %(default)s]", + metavar="INT", + ) + group.add_argument( + "-w", + "--width", + dest="sample_width", + type=int, + default=2, + help="Number of bytes per audio sample [default: %(default)s]", + metavar="INT", + ) + + group = parser.add_argument_group( + "Do something with audio events", + "Use these options to print, play back or plot detections.", + ) + group.add_argument( + "-C", + "--command", + dest="command", + type=str, + help="Command to call when an audio detection occurs. 
Use '{file}' " + "as a placeholder for the temporary wav file that will contain " + "event's data (e.g., \"-C 'du -h {file}'\" to print out file size " + " or \"-C 'play -q {file}'\" to play audio with sox)", + metavar="STRING", + ) + group.add_argument( + "-E", + "--echo", + dest="echo", + action="store_true", + default=False, + help="Play back each detection immediately using pyaudio", + ) + group.add_argument( + "-B", + "--progress-bar", + dest="progress_bar", + action="store_true", + default=False, + help="Show a progress bar when playing audio", + ) + group.add_argument( + "-p", + "--plot", + dest="plot", + action="store_true", + default=False, + help="Plot and show audio signal and detections (requires " + "matplotlib)", + ) + group.add_argument( + "--save-image", + dest="save_image", + type=str, + help="Save plotted audio signal and detections as a picture or a " + "PDF file (requires matplotlib)", + metavar="FILE", + ) + group.add_argument( + "--printf", + dest="printf", + type=str, + default="{id} {start} {end}", + help="Print audio events information, one per line, using this " + "format. Format can contain text with the following placeholders: " + "{id} (sequential, starts from 1), {start}, {end}, {duration} and " + "{timestamp}. The first 3 time placeholders are in seconds and " + "their format can be set using --time-format argument. " + "{timestamp} is the system timestamp (date and time) of the event " + "and can be set using --timestamp-format argument.\n" + "Example: '[{id}]: {start} -> {end} -- {timestamp}'", + metavar="STRING", + ) + group.add_argument( + "--time-format", + dest="time_format", + type=str, + default="%S", + help="Format used to print {start}, {end} and {duration} " + "placeholders used with --printf [default= %(default)s]. The " + "following formats are accepted:\n" + "%%S: absolute time in seconds. %%I: absolute time in ms. 
If at " + "least one of (%%h, %%m, %%s, %%i) is used, convert time into " + "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only " + "supplied fields are printed. Note that %%S and %%I can only be " + "used alone", + metavar="STRING", + ) + group.add_argument( + "--timestamp-format", + dest="timestamp_format", + type=str, + default="%Y/%m/%d %H:%M:%S", + help="Format used to print {timestamp}. Should be a format " + "accepted by 'datetime' standard module. Default: " + "'%%Y/%%m/%%d %%H:%%M:%%S'", + ) + parser.add_argument( + "-q", + "--quiet", + dest="quiet", + action="store_true", + default=False, + help="Do not print any information about detections [default: " + "print 'id', 'start' and 'end' of each detection]", + ) + parser.add_argument( + "-D", + "--debug", + dest="debug", + action="store_true", + default=False, + help="Print processing operations to STDOUT", + ) + parser.add_argument( + "--debug-file", + dest="debug_file", + type=str, + default=None, + help="Print processing operations to FILE", + metavar="FILE", + ) + + args = parser.parse_args(argv) + logger = make_logger(args.debug, args.debug_file) + kwargs = make_kwargs(args) + reader, observers = initialize_workers( + logger=logger, **kwargs.io, **kwargs.miscellaneous + ) + tokenizer_worker = workers.TokenizerWorker( + reader, observers, logger=logger, **kwargs.split + ) + tokenizer_worker.start_all() - # process options - (opts, args) = parser.parse_args(argv) - - if opts.input == "-": - asource = StdinAudioSource(sampling_rate = opts.sampling_rate, - sample_width = opts.sample_width, - channels = opts.channels) - #read data from a file - elif opts.input is not None: - asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel) - - # read data from microphone via pyaudio - else: - try: - asource = PyAudioSource(sampling_rate = opts.sampling_rate, - sample_width = opts.sample_width, - channels = opts.channels) - except Exception: - 
sys.stderr.write("Cannot read data from audio device!\n") - sys.stderr.write("You should either install pyaudio or read data from STDIN\n") - sys.exit(2) - - logger = logging.getLogger(LOGGER_NAME) - logger.setLevel(logging.DEBUG) - - handler = logging.StreamHandler(sys.stdout) - if opts.quiet or not opts.debug: - # only critical messages will be printed - handler.setLevel(logging.CRITICAL) - else: - handler.setLevel(logging.DEBUG) - - logger.addHandler(handler) - - if opts.debug_file is not None: - logger.setLevel(logging.DEBUG) - opts.debug = True - handler = logging.FileHandler(opts.debug_file, "w") - fmt = logging.Formatter('[%(asctime)s] | %(message)s') - handler.setFormatter(fmt) - handler.setLevel(logging.DEBUG) - logger.addHandler(handler) - - record = opts.output_main is not None or opts.plot or opts.save_image is not None - - ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record) - validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold) - - - if opts.drop_trailing_silence: - mode = StreamTokenizer.DROP_TRAILING_SILENCE - else: - mode = 0 - - analysis_window_per_second = 1. 
/ opts.analysis_window - tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * analysis_window_per_second, - max_length=int(opts.max_duration * analysis_window_per_second), - max_continuous_silence=opts.max_silence * analysis_window_per_second, - mode = mode) - - - observers = [] - tokenizer_worker = None - - if opts.output_tokens is not None: - - try: - # check user format is correct - fname = opts.output_tokens.format(N=0, start=0, end=0) - - # find file type for detections - tok_type = opts.output_type - if tok_type is None: - tok_type = os.path.splitext(opts.output_tokens)[1][1:] - if tok_type == "": - tok_type = "wav" - - token_saver = TokenSaverWorker(name_format=opts.output_tokens, filetype=tok_type, - debug=opts.debug, logger=logger, sr=asource.get_sampling_rate(), - sw=asource.get_sample_width(), - ch=asource.get_channels()) - observers.append(token_saver) - - except Exception: - sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens)) - sys.exit(2) - - if opts.echo: - try: - player = player_for(asource) - player_worker = PlayerWorker(player=player, debug=opts.debug, logger=logger) - observers.append(player_worker) - except Exception: - sys.stderr.write("Cannot get an audio player!\n") - sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n") - sys.exit(2) - - if opts.command is not None and len(opts.command) > 0: - cmd_worker = CommandLineWorker(command=opts.command, debug=opts.debug, logger=logger) - observers.append(cmd_worker) - - if not opts.quiet or opts.plot is not None or opts.save_image is not None: - oformat = opts.printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r") - converter = seconds_to_str_fromatter(opts.time_format) - log_worker = LogWorker(print_detections = not opts.quiet, output_format=oformat, - time_formatter=converter, logger=logger, debug=opts.debug) - observers.append(log_worker) - - tokenizer_worker = 
TokenizerWorker(ads, tokenizer, opts.analysis_window, observers) - - def _save_main_stream(): - # find file type - main_type = opts.output_type - if main_type is None: - main_type = os.path.splitext(opts.output_main)[1][1:] - if main_type == "": - main_type = "wav" - ads.close() - ads.rewind() - data = ads.get_audio_source().get_data_buffer() - if len(data) > 0: - save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(), - sw = asource.get_sample_width(), - ch = asource.get_channels()) - - def _plot(): - import numpy as np - ads.close() - ads.rewind() - data = ads.get_audio_source().get_data_buffer() - signal = AudioEnergyValidator._convert(data, asource.get_sample_width()) - detections = [(det[3] , det[4]) for det in log_worker.detections] - max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1 - energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude - plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections, show = opts.plot, save_as = opts.save_image) - - - # start observer threads - for obs in observers: - obs.start() - # start tokenization thread - tokenizer_worker.start() - while True: time.sleep(1) if len(threading.enumerate()) == 1: - break - - tokenizer_worker = None - - if opts.output_main is not None: - _save_main_stream() - if opts.plot or opts.save_image is not None: - _plot() - - return 0 - - except KeyboardInterrupt: - + raise EndOfProcessing + + except (KeyboardInterrupt, EndOfProcessing): if tokenizer_worker is not None: - tokenizer_worker.stop() - for obs in observers: - obs.stop() - - if opts.output_main is not None: - _save_main_stream() - if opts.plot or opts.save_image is not None: - _plot() - + tokenizer_worker.stop_all() + + if isinstance(reader, workers.StreamSaverWorker): + reader.join() + try: + reader.save_stream() + except AudioEncodingWarning as ae_warn: + print(str(ae_warn), file=sys.stderr) + + if args.plot or 
args.save_image is not None: + from .plotting import plot + + reader.rewind() + record = AudioRegion( + reader.data, reader.sr, reader.sw, reader.ch + ) + detections = ( + (det.start, det.end) for det in tokenizer_worker.detections + ) + plot( + record, + detections=detections, + energy_threshold=args.energy_threshold, + show=True, + save_as=args.save_image, + ) return 0 - except Exception as e: - sys.stderr.write(program_name + ": " + str(e) + "\n") - sys.stderr.write("for help use -h\n") - - return 2 if __name__ == "__main__": - if DEBUG: - sys.argv.append("-h") - if TESTRUN: - import doctest - doctest.testmod() - if PROFILE: - import cProfile - import pstats - profile_filename = 'auditok.auditok_profile.txt' - cProfile.run('main()', profile_filename) - statsfile = open("profile_stats.txt", "wb") - p = pstats.Stats(profile_filename, stream=statsfile) - stats = p.strip_dirs().sort_stats('cumulative') - stats.print_stats() - statsfile.close() - sys.exit(0) - sys.exit(main()) + sys.exit(main(None)) diff --git a/libs/auditok/cmdline_util.py b/libs/auditok/cmdline_util.py new file mode 100755 index 000000000..bde72aa36 --- /dev/null +++ b/libs/auditok/cmdline_util.py @@ -0,0 +1,126 @@ +import sys +import logging +from collections import namedtuple +from . 
import workers +from .util import AudioDataSource +from .io import player_for + +_AUDITOK_LOGGER = "AUDITOK_LOGGER" +KeywordArguments = namedtuple( + "KeywordArguments", ["io", "split", "miscellaneous"] +) + + +def make_kwargs(args_ns): + if args_ns.save_stream is None: + record = args_ns.plot or (args_ns.save_image is not None) + else: + record = False + try: + use_channel = int(args_ns.use_channel) + except (ValueError, TypeError): + use_channel = args_ns.use_channel + + io_kwargs = { + "input": args_ns.input, + "audio_format": args_ns.input_format, + "max_read": args_ns.max_read, + "block_dur": args_ns.analysis_window, + "sampling_rate": args_ns.sampling_rate, + "sample_width": args_ns.sample_width, + "channels": args_ns.channels, + "use_channel": use_channel, + "save_stream": args_ns.save_stream, + "save_detections_as": args_ns.save_detections_as, + "export_format": args_ns.output_format, + "large_file": args_ns.large_file, + "frames_per_buffer": args_ns.frame_per_buffer, + "input_device_index": args_ns.input_device_index, + "record": record, + } + + split_kwargs = { + "min_dur": args_ns.min_duration, + "max_dur": args_ns.max_duration, + "max_silence": args_ns.max_silence, + "drop_trailing_silence": args_ns.drop_trailing_silence, + "strict_min_dur": args_ns.strict_min_duration, + "energy_threshold": args_ns.energy_threshold, + } + + miscellaneous = { + "echo": args_ns.echo, + "progress_bar": args_ns.progress_bar, + "command": args_ns.command, + "quiet": args_ns.quiet, + "printf": args_ns.printf, + "time_format": args_ns.time_format, + "timestamp_format": args_ns.timestamp_format, + } + return KeywordArguments(io_kwargs, split_kwargs, miscellaneous) + + +def make_logger(stderr=False, file=None, name=_AUDITOK_LOGGER): + if not stderr and file is None: + return None + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + if stderr: + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(logging.INFO) + logger.addHandler(handler) + + if file 
is not None: + handler = logging.FileHandler(file, "w") + fmt = logging.Formatter("[%(asctime)s] | %(message)s") + handler.setFormatter(fmt) + handler.setLevel(logging.INFO) + logger.addHandler(handler) + return logger + + +def initialize_workers(logger=None, **kwargs): + observers = [] + reader = AudioDataSource(source=kwargs["input"], **kwargs) + if kwargs["save_stream"] is not None: + reader = workers.StreamSaverWorker( + reader, + filename=kwargs["save_stream"], + export_format=kwargs["export_format"], + ) + reader.start() + + if kwargs["save_detections_as"] is not None: + worker = workers.RegionSaverWorker( + kwargs["save_detections_as"], + kwargs["export_format"], + logger=logger, + ) + observers.append(worker) + + if kwargs["echo"]: + player = player_for(reader) + worker = workers.PlayerWorker( + player, progress_bar=kwargs["progress_bar"], logger=logger + ) + observers.append(worker) + + if kwargs["command"] is not None: + worker = workers.CommandLineWorker( + command=kwargs["command"], logger=logger + ) + observers.append(worker) + + if not kwargs["quiet"]: + print_format = ( + kwargs["printf"] + .replace("\\n", "\n") + .replace("\\t", "\t") + .replace("\\r", "\r") + ) + worker = workers.PrintWorker( + print_format, kwargs["time_format"], kwargs["timestamp_format"] + ) + observers.append(worker) + + return reader, observers diff --git a/libs/auditok/core.py b/libs/auditok/core.py index 47441d2b7..af00dc7af 100644 --- a/libs/auditok/core.py +++ b/libs/auditok/core.py @@ -1,264 +1,1267 @@ """ -This module gathers processing (i.e. tokenization) classes. - -Class summary -============= - .. autosummary:: + :toctree: generated/ - StreamTokenizer + load + split + AudioRegion + StreamTokenizer """ +import os +import math +from .util import AudioReader, DataValidator, AudioEnergyValidator +from .io import check_audio_data, to_file, player_for, get_audio_source +from .exceptions import TooSamllBlockDuration + +try: + from . 
import signal_numpy as signal +except ImportError: + from . import signal + +__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"] + + +DEFAULT_ANALYSIS_WINDOW = 0.05 +DEFAULT_ENERGY_THRESHOLD = 50 +_EPSILON = 1e-10 + + +def load(input, skip=0, max_read=None, **kwargs): + """Load audio data from a source and return it as an :class:`AudioRegion`. + + Parameters + ---------- + input : None, str, bytes, AudioSource + source to read audio data from. If `str`, it should be a path to a + valid audio file. If `bytes`, it is used as raw audio data. If it is + "-", raw data will be read from stdin. If None, read audio data from + the microphone using PyAudio. If of type `bytes` or is a path to a + raw audio file then `sampling_rate`, `sample_width` and `channels` + parameters (or their alias) are required. If it's an + :class:`AudioSource` object it's used directly to read data. + skip : float, default: 0 + amount, in seconds, of audio data to skip from source. If read from + a microphone, `skip` must be 0, otherwise a `ValueError` is raised. + max_read : float, default: None + amount, in seconds, of audio data to read from source. If read from + microphone, `max_read` should not be None, otherwise a `ValueError` is + raised. + audio_format, fmt : str + type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only + be used if `input` is a string path to an audio file. If not given, + audio type will be guessed from file name extension or from file + header. + sampling_rate, sr : int + sampling rate of audio data. Required if `input` is a raw audio file, + a `bytes` object or None (i.e., read from microphone). + sample_width, sw : int + number of bytes used to encode one audio sample, typically 1, 2 or 4. + Required for raw data, see `sampling_rate`. + channels, ch : int + number of channels of audio data. Required for raw data, see + `sampling_rate`. 
+ large_file : bool, default: False + If True, AND if `input` is a path to a *wav* of a *raw* audio file + (and **only** these two formats) then audio file is not fully loaded to + memory in order to create the region (but the portion of data needed to + create the region is of course loaded to memory). Set to True if + `max_read` is significantly smaller then the size of a large audio file + that shouldn't be entirely loaded to memory. + + Returns + ------- + region: AudioRegion + + Raises + ------ + ValueError + raised if `input` is None (i.e., read data from microphone) and `skip` + != 0 or `input` is None `max_read` is None (meaning that when reading + from the microphone, no data should be skipped, and maximum amount of + data to read should be explicitly provided). + """ + return AudioRegion.load(input, skip, max_read, **kwargs) + + +def split( + input, + min_dur=0.2, + max_dur=5, + max_silence=0.3, + drop_trailing_silence=False, + strict_min_dur=False, + **kwargs +): + """ + Split audio data and return a generator of AudioRegions + + Parameters + ---------- + input : str, bytes, AudioSource, AudioReader, AudioRegion or None + input audio data. If str, it should be a path to an existing audio file. + "-" is interpreted as standard input. If bytes, input is considered as + raw audio data. If None, read audio from microphone. + Every object that is not an `AudioReader` will be transformed into an + `AudioReader` before processing. If it is an `str` that refers to a raw + audio file, `bytes` or None, audio parameters should be provided using + kwargs (i.e., `samplig_rate`, `sample_width` and `channels` or their + alias). + If `input` is str then audio format will be guessed from file extension. + `audio_format` (alias `fmt`) kwarg can also be given to specify audio + format explicitly. If none of these options is available, rely on + backend (currently only pydub is supported) to load data. 
+ min_dur : float, default: 0.2 + minimun duration in seconds of a detected audio event. By using large + values for `min_dur`, very short audio events (e.g., very short 1-word + utterances like 'yes' or 'no') can be mis detected. Using very short + values might result in a high number of short, unuseful audio events. + max_dur : float, default: 5 + maximum duration in seconds of a detected audio event. If an audio event + lasts more than `max_dur` it will be truncated. If the continuation of a + truncated audio event is shorter than `min_dur` then this continuation + is accepted as a valid audio event if `strict_min_dur` is False. + Otherwise it is rejected. + max_silence : float, default: 0.3 + maximum duration of continuous silence within an audio event. There + might be many silent gaps of this duration within one audio event. If + the continuous silence happens at the end of the event than it's kept as + part of the event if `drop_trailing_silence` is False (default). + drop_trailing_silence : bool, default: False + Whether to remove trailing silence from detected events. To avoid abrupt + cuts in speech, trailing silence should be kept, therefore this + parameter should be False. + strict_min_dur : bool, default: False + strict minimum duration. Do not accept an audio event if it is shorter + than `min_dur` even if it is contiguous to the latest valid event. This + happens if the the latest detected event had reached `max_dur`. + + Other Parameters + ---------------- + analysis_window, aw : float, default: 0.05 (50 ms) + duration of analysis window in seconds. A value between 0.01 (10 ms) and + 0.1 (100 ms) should be good for most use-cases. + audio_format, fmt : str + type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be + used if `input` is a string path to an audio file. If not given, audio + type will be guessed from file name extension or from file header. + sampling_rate, sr : int + sampling rate of audio data. 
Required if `input` is a raw audio file, is + a bytes object or None (i.e., read from microphone). + sample_width, sw : int + number of bytes used to encode one audio sample, typically 1, 2 or 4. + Required for raw data, see `sampling_rate`. + channels, ch : int + number of channels of audio data. Required for raw data, see + `sampling_rate`. + use_channel, uc : {None, "mix"} or int + which channel to use for split if `input` has multiple audio channels. + Regardless of which channel is used for splitting, returned audio events + contain data from *all* channels, just as `input`. + The following values are accepted: + + - None (alias "any"): accept audio activity from any channel, even if + other channels are silent. This is the default behavior. + + - "mix" ("avg" or "average"): mix down all channels (i.e. compute + average channel) and split the resulting channel. + + - int (0 <=, > `channels`): use one channel, specified by integer id, + for split. + + large_file : bool, default: False + If True, AND if `input` is a path to a *wav* of a *raw* audio file + (and only these two formats) then audio data is lazily loaded to memory + (i.e., one analysis window a time). Otherwise the whole file is loaded + to memory before split. Set to True if the size of the file is larger + than available memory. + max_read, mr : float, default: None, read until end of stream + maximum data to read from source in seconds. + validator, val : callable, DataValidator + custom data validator. If `None` (default), an `AudioEnergyValidor` is + used with the given energy threshold. Can be a callable or an instance + of `DataValidator` that implements `is_valid`. In either case, it'll be + called with with a window of audio data as the first parameter. + energy_threshold, eth : float, default: 50 + energy threshold for audio activity detection. Audio regions that have + enough windows of with a signal energy equal to or above this threshold + are considered valid audio events. 
Here we are referring to this amount + as the energy of the signal but to be more accurate, it is the log + energy of computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see + :class:`AudioEnergyValidator` and + :func:`calculate_energy_single_channel`). If `validator` is given, this + argument is ignored. + + Yields + ------ + AudioRegion + a generator of detected :class:`AudioRegion` s. + """ + if min_dur <= 0: + raise ValueError("'min_dur' ({}) must be > 0".format(min_dur)) + if max_dur <= 0: + raise ValueError("'max_dur' ({}) must be > 0".format(max_dur)) + if max_silence < 0: + raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence)) + + if isinstance(input, AudioReader): + source = input + analysis_window = source.block_dur + else: + analysis_window = kwargs.get( + "analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW) + ) + if analysis_window <= 0: + raise ValueError( + "'analysis_window' ({}) must be > 0".format(analysis_window) + ) + + params = kwargs.copy() + params["max_read"] = params.get("max_read", params.get("mr")) + params["audio_format"] = params.get("audio_format", params.get("fmt")) + if isinstance(input, AudioRegion): + params["sampling_rate"] = input.sr + params["sample_width"] = input.sw + params["channels"] = input.ch + input = bytes(input) + try: + source = AudioReader(input, block_dur=analysis_window, **params) + except TooSamllBlockDuration as exc: + err_msg = "Too small 'analysis_windows' ({0}) for sampling rate " + err_msg += "({1}). 
Analysis windows should at least be 1/{1} to " + err_msg += "cover one single data sample" + raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate)) + + validator = kwargs.get("validator", kwargs.get("val")) + if validator is None: + energy_threshold = kwargs.get( + "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD) + ) + use_channel = kwargs.get("use_channel", kwargs.get("uc")) + validator = AudioEnergyValidator( + energy_threshold, source.sw, source.ch, use_channel=use_channel + ) + mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0 + if strict_min_dur: + mode |= StreamTokenizer.STRICT_MIN_LENGTH + min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil) + max_length = _duration_to_nb_windows( + max_dur, analysis_window, math.floor, _EPSILON + ) + max_continuous_silence = _duration_to_nb_windows( + max_silence, analysis_window, math.floor, _EPSILON + ) + + err_msg = "({0} sec.) results in {1} analysis window(s) " + err_msg += "({1} == {6}({0} / {2})) which is {5} the number " + err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))" + if min_length > max_length: + err_msg = "'min_dur' " + err_msg + raise ValueError( + err_msg.format( + min_dur, + min_length, + analysis_window, + max_length, + max_dur, + "higher than", + "ceil", + ) + ) + + if max_continuous_silence >= max_length: + err_msg = "'max_silence' " + err_msg + raise ValueError( + err_msg.format( + max_silence, + max_continuous_silence, + analysis_window, + max_length, + max_dur, + "higher or equal to", + "floor", + ) + ) + + tokenizer = StreamTokenizer( + validator, min_length, max_length, max_continuous_silence, mode=mode + ) + source.open() + token_gen = tokenizer.tokenize(source, generator=True) + region_gen = ( + _make_audio_region( + token[0], + token[1], + source.block_dur, + source.sr, + source.sw, + source.ch, + ) + for token in token_gen + ) + return region_gen + + +def _duration_to_nb_windows( + duration, 
analysis_window, round_fn=round, epsilon=0 +): + """ + Converts a given duration into a positive integer of analysis windows. + if `duration / analysis_window` is not an integer, the result will be + rounded to the closest bigger integer. If `duration == 0`, returns `0`. + If `duration < analysis_window`, returns 1. + `duration` and `analysis_window` can be in seconds or milliseconds but + must be in the same unit. + + Parameters + ---------- + duration : float + a given duration in seconds or ms. + analysis_window: float + size of analysis window, in the same unit as `duration`. + round_fn : callable + function called to round the result. Default: `round`. + epsilon : float + small value to add to the division result before rounding. + E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with + `round_fn=math.floor` returns `2` instead of `3`. Adding a small value + to `0.3 / 0.1` avoids this error. + + Returns + ------- + nb_windows : int + minimum number of `analysis_window`'s to cover `durartion`. That means + that `analysis_window * nb_windows >= duration`. + """ + if duration < 0 or analysis_window <= 0: + err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0" + raise ValueError(err_msg.format(duration, analysis_window)) + if duration == 0: + return 0 + return int(round_fn(duration / analysis_window + epsilon)) + + +def _make_audio_region( + data_frames, + start_frame, + frame_duration, + sampling_rate, + sample_width, + channels, +): + """ + Helper function to create an `AudioRegion` from parameters returned by + tokenization object. It takes care of setting up region `start` and `end` + in metadata. 
+ + Parameters + ---------- + frame_duration: float + duration of analysis window in seconds + start_frame : int + index of the fisrt analysis window + samling_rate : int + sampling rate of audio data + sample_width : int + number of bytes of one audio sample + channels : int + number of channels of audio data + + Returns + ------- + audio_region : AudioRegion + AudioRegion whose start time is calculeted as: + `1000 * start_frame * frame_duration` + """ + start = start_frame * frame_duration + data = b"".join(data_frames) + duration = len(data) / (sampling_rate * sample_width * channels) + meta = {"start": start, "end": start + duration} + return AudioRegion(data, sampling_rate, sample_width, channels, meta) + + +def _read_chunks_online(max_read, **kwargs): + """ + Helper function to read audio data from an online blocking source + (i.e., microphone). Used to build an `AudioRegion` and can intercept + KeyboardInterrupt so that reading stops as soon as this exception is + raised. Makes building `AudioRegion`s on [i]python sessions and jupyter + notebooks more user friendly. + + Parameters + ---------- + max_read : float + maximum amount of data to read in seconds. + kwargs : + audio parameters (sampling_rate, sample_width and channels). + + See also + -------- + `AudioRegion.build` + """ + reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs) + reader.open() + data = [] + try: + while True: + frame = reader.read() + if frame is None: + break + data.append(frame) + except KeyboardInterrupt: + # Stop data acquisition from microphone when pressing + # Ctrl+C on a [i]python session or a notebook + pass + reader.close() + return ( + b"".join(data), + reader.sampling_rate, + reader.sample_width, + reader.channels, + ) + + +def _read_offline(input, skip=0, max_read=None, **kwargs): + """ + Helper function to read audio data from an offline (i.e., file). Used to + build `AudioRegion`s. 
+ + Parameters + ---------- + input : str, bytes + path to audio file (if str), or a bytes object representing raw audio + data. + skip : float, default 0 + amount of data to skip from the begining of audio source. + max_read : float, default: None + maximum amount of audio data to read. Default: None, means read until + end of stream. + kwargs : + audio parameters (sampling_rate, sample_width and channels). + + See also + -------- + `AudioRegion.build` + + """ + audio_source = get_audio_source(input, **kwargs) + audio_source.open() + if skip is not None and skip > 0: + skip_samples = round(skip * audio_source.sampling_rate) + audio_source.read(skip_samples) + if max_read is not None: + if max_read < 0: + max_read = None + else: + max_read = round(max_read * audio_source.sampling_rate) + data = audio_source.read(max_read) + audio_source.close() + return ( + data, + audio_source.sampling_rate, + audio_source.sample_width, + audio_source.channels, + ) + + +def _check_convert_index(index, types, err_msg): + if not isinstance(index, slice) or index.step is not None: + raise TypeError(err_msg) + start = index.start if index.start is not None else 0 + stop = index.stop + for index in (start, stop): + if index is not None and not isinstance(index, types): + raise TypeError(err_msg) + return start, stop + + +class _SecondsView: + """A class to create a view of `AudioRegion` that can be sliced using + indices in seconds. + """ + + def __init__(self, region): + self._region = region + + def __getitem__(self, index): + err_msg = "Slicing AudioRegion by seconds requires indices of type " + err_msg += "'int' or 'float' without a step (e.g. 
region.sec[7.5:10])" + start_s, stop_s = _check_convert_index(index, (int, float), err_msg) + sr = self._region.sampling_rate + start_sample = int(start_s * sr) + stop_sample = None if stop_s is None else round(stop_s * sr) + return self._region[start_sample:stop_sample] + + @property + def len(self): + """ + Return region duration in seconds. + """ + return self._region.duration + + +class _MillisView(_SecondsView): + """A class to create a view of `AudioRegion` that can be sliced using + indices in milliseconds. + """ + + def __getitem__(self, index): + err_msg = ( + "Slicing AudioRegion by milliseconds requires indices of type " + ) + err_msg += "'int' without a step (e.g. region.sec[500:1500])" + start_ms, stop_ms = _check_convert_index(index, (int), err_msg) + start_sec = start_ms / 1000 + stop_sec = None if stop_ms is None else stop_ms / 1000 + index = slice(start_sec, stop_sec) + return super(_MillisView, self).__getitem__(index) + + def __len__(self): + """ + Return region duration in milliseconds. + """ + return round(self._region.duration * 1000) + + @property + def len(self): + """ + Return region duration in milliseconds. + """ + return len(self) + + +class _AudioRegionMetadata(dict): + """A class to store `AudioRegion`'s metadata.""" + + def __getattr__(self, name): + if name in self: + return self[name] + else: + err_msg = "AudioRegion metadata has no entry '{}'" + raise AttributeError(err_msg.format(name)) + + def __setattr__(self, name, value): + self[name] = value + + def __str__(self): + return "\n".join("{}: {}".format(k, v) for k, v in self.items()) + + def __repr__(self): + return str(self) + + +class AudioRegion(object): + """ + AudioRegion encapsulates raw audio data and provides an interface to + perform simple operations on it. Use `AudioRegion.load` to build an + `AudioRegion` from different types of objects. 
+ + Parameters + ---------- + data : bytes + raw audio data as a bytes object + sampling_rate : int + sampling rate of audio data + sample_width : int + number of bytes of one audio sample + channels : int + number of channels of audio data + meta : dict, default: None + any collection of <key:value> elements used to build metadata for + this `AudioRegion`. Meta data can be accessed via `region.meta.key` + if `key` is a valid python attribute name, or via `region.meta[key]` + if not. Note that the :func:`split` function (or the + :meth:`AudioRegion.split` method) returns `AudioRegions` with a ``start`` + and a ``stop`` meta values that indicate the location in seconds of the + region in original audio data. + + See also + -------- + AudioRegion.load + + """ + + def __init__(self, data, sampling_rate, sample_width, channels, meta=None): + check_audio_data(data, sample_width, channels) + self._data = data + self._sampling_rate = sampling_rate + self._sample_width = sample_width + self._channels = channels + self._samples = None + self.splitp = self.split_and_plot + + if meta is not None: + self._meta = _AudioRegionMetadata(meta) + else: + self._meta = None + + self._seconds_view = _SecondsView(self) + self.sec = self.seconds + self.s = self.seconds + + self._millis_view = _MillisView(self) + self.ms = self.millis + + @property + def meta(self): + return self._meta + + @meta.setter + def meta(self, new_meta): + """Meta data of audio region.""" + self._meta = _AudioRegionMetadata(new_meta) + + @classmethod + def load(cls, input, skip=0, max_read=None, **kwargs): + """ + Create an `AudioRegion` by loading data from `input`. See :func:`load` + for parameters descripion. + + Returns + ------- + region: AudioRegion + + Raises + ------ + ValueError + raised if `input` is None and `skip` != 0 or `max_read` is None. 
+ """ + if input is None: + if skip > 0: + raise ValueError( + "'skip' should be 0 when reading from microphone" + ) + if max_read is None or max_read < 0: + raise ValueError( + "'max_read' should not be None when reading from " + "microphone" + ) + data, sampling_rate, sample_width, channels = _read_chunks_online( + max_read, **kwargs + ) + else: + data, sampling_rate, sample_width, channels = _read_offline( + input, skip=skip, max_read=max_read, **kwargs + ) + + return cls(data, sampling_rate, sample_width, channels) + + @property + def seconds(self): + """ + A view to slice audio region by seconds (using ``region.seconds[start:end]``). + """ + return self._seconds_view + + @property + def millis(self): + """A view to slice audio region by milliseconds (using ``region.millis[start:end]``).""" + return self._millis_view + + @property + def duration(self): + """ + Returns region duration in seconds. + """ + return len(self._data) / ( + self.sampling_rate * self.sample_width * self.channels + ) + + @property + def sampling_rate(self): + """Samling rate of audio data.""" + return self._sampling_rate + + @property + def sr(self): + """Samling rate of audio data, alias for `sampling_rate`.""" + return self._sampling_rate + + @property + def sample_width(self): + """Number of bytes per sample, one channel considered.""" + return self._sample_width + + @property + def sw(self): + """Number of bytes per sample, alias for `sampling_rate`.""" + return self._sample_width + + @property + def channels(self): + """Number of channels of audio data.""" + return self._channels + + @property + def ch(self): + """Number of channels of audio data, alias for `channels`.""" + return self._channels + + def play(self, progress_bar=False, player=None, **progress_bar_kwargs): + """ + Play audio region. + + Parameters + ---------- + progress_bar : bool, default: False + whether to use a progress bar while playing audio. Default: False. 
+ `progress_bar` requires `tqdm`, if not installed, no progress bar + will be shown. + player : AudioPalyer, default: None + audio player to use. if None (default), use `player_for()` + to get a new audio player. + progress_bar_kwargs : kwargs + keyword arguments to pass to `tqdm` progress_bar builder (e.g., + use `leave=False` to clean up the screen when play finishes). + """ + if player is None: + player = player_for(self) + player.play( + self._data, progress_bar=progress_bar, **progress_bar_kwargs + ) + + def save(self, file, audio_format=None, exists_ok=True, **audio_parameters): + """ + Save audio region to file. -from auditok.util import DataValidator + Parameters + ---------- + file : str + path to output audio file. May contain `{duration}` placeholder + as well as any place holder that this region's metadata might + contain (e.g., regions returned by `split` contain metadata with + `start` and `end` attributes that can be used to build output file + name as `{meta.start}` and `{meta.end}`. See examples using + placeholders with formatting. -__all__ = ["StreamTokenizer"] + audio_format : str, default: None + format used to save audio data. If None (default), format is guessed + from file name's extension. If file name has no extension, audio + data is saved as a raw (headerless) audio file. + exists_ok : bool, default: True + If True, overwrite `file` if a file with the same name exists. + If False, raise an `IOError` if `file` exists. + audio_parameters: dict + any keyword arguments to be passed to audio saving backend. + Returns + ------- + file: str + name of output file with replaced placehoders. + Raises + IOError if `file` exists and `exists_ok` is False. 
-class StreamTokenizer(): + + Examples + -------- + >>> region = AudioRegion(b'\\0' * 2 * 24000, + >>> sampling_rate=16000, + >>> sample_width=2, + >>> channels=1) + >>> region.meta.start = 2.25 + >>> region.meta.end = 2.25 + region.duration + >>> region.save('audio_{meta.start}-{meta.end}.wav') + >>> audio_2.25-3.75.wav + >>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav') + audio_2.250_1.500.wav + """ + if isinstance(file, str): + file = file.format(duration=self.duration, meta=self.meta) + if not exists_ok and os.path.exists(file): + raise FileExistsError("file '{file}' exists".format(file=file)) + to_file( + self._data, + file, + audio_format, + sr=self.sr, + sw=self.sw, + ch=self.ch, + audio_parameters=audio_parameters, + ) + return file + + def split( + self, + min_dur=0.2, + max_dur=5, + max_silence=0.3, + drop_trailing_silence=False, + strict_min_dur=False, + **kwargs + ): + """Split audio region. See :func:`auditok.split()` for a comprehensive + description of split parameters. + See Also :meth:`AudioRegio.split_and_plot`. + """ + if kwargs.get("max_read", kwargs.get("mr")) is not None: + warn_msg = "'max_read' (or 'mr') should not be used with " + warn_msg += "AudioRegion.split_and_plot(). You should rather " + warn_msg += "slice audio region before calling this method" + raise RuntimeWarning(warn_msg) + return split( + self, + min_dur=min_dur, + max_dur=max_dur, + max_silence=max_silence, + drop_trailing_silence=drop_trailing_silence, + strict_min_dur=strict_min_dur, + **kwargs + ) + + def plot( + self, + scale_signal=True, + show=True, + figsize=None, + save_as=None, + dpi=120, + theme="auditok", + ): + """Plot audio region, one sub-plot for each channel. + + Parameters + ---------- + scale_signal : bool, default: True + if true, scale signal by subtracting its mean and dividing by its + standard deviation before plotting. + show : bool + whether to show plotted signal right after the call. 
+ figsize : tuple, default: None + width and height of the figure to pass to `matplotlib`. + save_as : str, default None. + if provided, also save plot to file. + dpi : int, default: 120 + plot dpi to pass to `matplotlib`. + theme : str or dict, default: "auditok" + plot theme to use. Currently only "auditok" theme is implemented. To + provide you own them see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`. + """ + try: + from auditok.plotting import plot + + plot( + self, + scale_signal=scale_signal, + show=show, + figsize=figsize, + save_as=save_as, + dpi=dpi, + theme=theme, + ) + except ImportError: + raise RuntimeWarning("Plotting requires matplotlib") + + def split_and_plot( + self, + min_dur=0.2, + max_dur=5, + max_silence=0.3, + drop_trailing_silence=False, + strict_min_dur=False, + scale_signal=True, + show=True, + figsize=None, + save_as=None, + dpi=120, + theme="auditok", + **kwargs + ): + """Split region and plot signal and detections. Alias: :meth:`splitp`. + See :func:`auditok.split()` for a comprehensive description of split + parameters. Also see :meth:`plot` for plot parameters. 
+ """ + try: + from auditok.plotting import plot + + regions = self.split( + min_dur=min_dur, + max_dur=max_dur, + max_silence=max_silence, + drop_trailing_silence=drop_trailing_silence, + strict_min_dur=strict_min_dur, + **kwargs + ) + regions = list(regions) + detections = ((reg.meta.start, reg.meta.end) for reg in regions) + eth = kwargs.get( + "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD) + ) + plot( + self, + scale_signal=scale_signal, + detections=detections, + energy_threshold=eth, + show=show, + figsize=figsize, + save_as=save_as, + dpi=dpi, + theme=theme, + ) + return regions + except ImportError: + raise RuntimeWarning("Plotting requires matplotlib") + + def __array__(self): + return self.samples + + @property + def samples(self): + """Audio region as arrays of samples, one array per channel.""" + if self._samples is None: + self._samples = signal.to_array( + self._data, self.sample_width, self.channels + ) + return self._samples + + def __len__(self): + """ + Return region length in number of samples. + """ + return len(self._data) // (self.sample_width * self.channels) + + @property + def len(self): + """ + Return region length in number of samples. + """ + return len(self) + + def __bytes__(self): + return self._data + + def __str__(self): + return ( + "AudioRegion(duration={:.3f}, " + "sampling_rate={}, sample_width={}, channels={})".format( + self.duration, self.sr, self.sw, self.ch + ) + ) + + def __repr__(self): + return str(self) + + def __add__(self, other): + """ + Concatenates this region and `other` and return a new region. + Both regions must have the same sampling rate, sample width + and number of channels. If not, raises a `ValueError`. 
+ """ + if not isinstance(other, AudioRegion): + raise TypeError( + "Can only concatenate AudioRegion, " + 'not "{}"'.format(type(other)) + ) + if other.sr != self.sr: + raise ValueError( + "Can only concatenate AudioRegions of the same " + "sampling rate ({} != {})".format(self.sr, other.sr) + ) + if other.sw != self.sw: + raise ValueError( + "Can only concatenate AudioRegions of the same " + "sample width ({} != {})".format(self.sw, other.sw) + ) + if other.ch != self.ch: + raise ValueError( + "Can only concatenate AudioRegions of the same " + "number of channels ({} != {})".format(self.ch, other.ch) + ) + data = self._data + other._data + return AudioRegion(data, self.sr, self.sw, self.ch) + + def __radd__(self, other): + """ + Concatenates `other` and this region. `other` should be an + `AudioRegion` with the same audio parameters as this region + but can exceptionally be `0` to make it possible to concatenate + many regions with `sum`. + """ + if other == 0: + return self + return other.add(self) + + def __mul__(self, n): + if not isinstance(n, int): + err_msg = "Can't multiply AudioRegion by a non-int of type '{}'" + raise TypeError(err_msg.format(type(n))) + data = self._data * n + return AudioRegion(data, self.sr, self.sw, self.ch) + + def __rmul__(self, n): + return self * n + + def __truediv__(self, n): + if not isinstance(n, int) or n <= 0: + raise TypeError("AudioRegion can only be divided by a positive int") + samples_per_sub_region, rest = divmod(len(self), n) + onset = 0 + sub_regions = [] + while onset < len(self): + offset = 0 + if rest > 0: + offset = 1 + rest -= 1 + offset += onset + samples_per_sub_region + sub_regions.append(self[onset:offset]) + onset = offset + return sub_regions + + def __eq__(self, other): + if other is self: + return True + if not isinstance(other, AudioRegion): + return False + return ( + (self._data == other._data) + and (self.sr == other.sr) + and (self.sw == other.sw) + and (self.ch == other.ch) + ) + + def 
__getitem__(self, index): + err_msg = "Slicing AudioRegion by samples requires indices of type " + err_msg += "'int' without a step (e.g. region.sec[1600:3200])" + start_sample, stop_sample = _check_convert_index(index, (int), err_msg) + + bytes_per_sample = self.sample_width * self.channels + len_samples = len(self._data) // bytes_per_sample + + if start_sample < 0: + start_sample = max(start_sample + len_samples, 0) + onset = start_sample * bytes_per_sample + + if stop_sample is not None: + if stop_sample < 0: + stop_sample = max(stop_sample + len_samples, 0) + offset = index.stop * bytes_per_sample + else: + offset = None + + data = self._data[onset:offset] + return AudioRegion(data, self.sr, self.sw, self.ch) + + +class StreamTokenizer: """ Class for stream tokenizers. It implements a 4-state automaton scheme to extract sub-sequences of interest on the fly. - - :Parameters: - - `validator` : - instance of `DataValidator` that implements `is_valid` method. - - `min_length` : *(int)* - Minimum number of frames of a valid token. This includes all \ - tolerated non valid frames within the token. - - `max_length` : *(int)* - Maximum number of frames of a valid token. This includes all \ - tolerated non valid frames within the token. - - `max_continuous_silence` : *(int)* - Maximum number of consecutive non-valid frames within a token. - Note that, within a valid token, there may be many tolerated \ - *silent* regions that contain each a number of non valid frames up to \ - `max_continuous_silence` - - `init_min` : *(int, default=0)* - Minimum number of consecutive valid frames that must be **initially** \ - gathered before any sequence of non valid frames can be tolerated. This - option is not always needed, it can be used to drop non-valid tokens as - early as possible. **Default = 0** means that the option is by default - ineffective. 
- - `init_max_silence` : *(int, default=0)* - Maximum number of tolerated consecutive non-valid frames if the \ - number already gathered valid frames has not yet reached 'init_min'. - This argument is normally used if `init_min` is used. **Default = 0**, - by default this argument is not taken into consideration. - - `mode` : *(int, default=0)* - `mode` can be: - - 1. `StreamTokenizer.STRICT_MIN_LENGTH`: - if token *i* is delivered because `max_length` - is reached, and token *i+1* is immediately adjacent to - token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts - at frame *k+1*) then accept token *i+1* only of it has a size of at - least `min_length`. The default behavior is to accept token *i+1* - event if it is shorter than `min_length` (given that the above conditions - are fulfilled of course). - - :Examples: - - In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is - accepted although it is shorter than `min_length` (3), because it immediately - follows the latest delivered token: - - .. code:: python - - from auditok import StreamTokenizer, StringDataSource, DataValidator - - class UpperCaseChecker(DataValidator): - def is_valid(self, frame): - return frame.isupper() - - - dsource = StringDataSource("aaaAAAABBbbb") - tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, - max_length=4, - max_continuous_silence=0) - - tokenizer.tokenize(dsource) - - - :output: - - .. code:: python - - [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] - - - The following tokenizer will however reject the 'BB' token: - - .. code:: python - - dsource = StringDataSource("aaaAAAABBbbb") - tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, max_length=4, - max_continuous_silence=0, - mode=StreamTokenizer.STRICT_MIN_LENGTH) - tokenizer.tokenize(dsource) - - :output: - - .. code:: python - - [(['A', 'A', 'A', 'A'], 3, 6)] - - - 2. 
`StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing non-valid frames - from a token to be delivered if and only if it is not **truncated**. - This can be a bit tricky. A token is actually delivered if: - - - a. `max_continuous_silence` is reached - - :or: - - - b. Its length reaches `max_length`. This is called a **truncated** token - - In the current implementation, a `StreamTokenizer`'s decision is only based on already seen - data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated - frame (`max_length` is reached but `max_continuous_silence` not yet) any tailing - silence will be kept because it can potentially be part of valid token (if `max_length` - was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered - token will not be considered as truncated but a result of *normal* end of detection - (i.e. no more valid data). In that case the tailing silence can be removed if you use - the `StreamTokenizer.DROP_TRAILING_SILENCE` mode. - - :Example: - - .. code:: python - - tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3, - max_length=6, max_continuous_silence=3, - mode=StreamTokenizer.DROP_TRAILING_SILENCE) - - dsource = StringDataSource("aaaAAAaaaBBbbbb") - tokenizer.tokenize(dsource) - - :output: - - .. code:: python - - [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] - - The first token is delivered with its tailing silence because it is truncated - while the second one has its tailing frames removed. - - Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: - - .. code:: python - - [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)] - - - - 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: - use both options. That means: first remove tailing silence, then ckeck if the - token still has at least a length of `min_length`. 
+ + Parameters + ---------- + validator : callable, DataValidator (must implement `is_valid`) + called with each data frame read from source. Should take one positional + argument and return True or False for valid and invalid frames + respectively. + + min_length : int + Minimum number of frames of a valid token. This includes all + tolerated non valid frames within the token. + + max_length : int + Maximum number of frames of a valid token. This includes all + tolerated non valid frames within the token. + + max_continuous_silence : int + Maximum number of consecutive non-valid frames within a token. + Note that, within a valid token, there may be many tolerated + *silent* regions that contain each a number of non valid frames up + to `max_continuous_silence` + + init_min : int + Minimum number of consecutive valid frames that must be + **initially** gathered before any sequence of non valid frames can + be tolerated. This option is not always needed, it can be used to + drop non-valid tokens as early as possible. **Default = 0** means + that the option is by default ineffective. + + init_max_silence : int + Maximum number of tolerated consecutive non-valid frames if the + number already gathered valid frames has not yet reached + 'init_min'.This argument is normally used if `init_min` is used. + **Default = 0**, by default this argument is not taken into + consideration. + + mode : int + mode can be one of the following: + + -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and + accept a token shorter than `min_length` if it is the continuation + of the latest delivered token. + + -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered + because `max_length` is reached, and token `i+1` is immediately + adjacent to token `i` (i.e. token `i` ends at frame `k` and token + `i+1` starts at frame `k+1`) then accept token `i+1` only of it has + a size of at least `min_length`. 
The default behavior is to accept + token `i+1` event if it is shorter than `min_length` (provided that + the above conditions are fulfilled of course). + + -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing + non-valid frames from a token to be delivered if and only if it + is not **truncated**. This can be a bit tricky. A token is actually + delivered if: + + - `max_continuous_silence` is reached. + + - Its length reaches `max_length`. This is referred to as a + **truncated** token. + + In the current implementation, a `StreamTokenizer`'s decision is only + based on already seen data and on incoming data. Thus, if a token is + truncated at a non-valid but tolerated frame (`max_length` is reached + but `max_continuous_silence` not yet) any tailing silence will be kept + because it can potentially be part of valid token (if `max_length` was + bigger). But if `max_continuous_silence` is reached before + `max_length`, the delivered token will not be considered as truncated + but a result of *normal* end of detection (i.e. no more valid data). + In that case the trailing silence can be removed if you use the + `StreamTokenizer.DROP_TRAILING_SILENCE` mode. + + -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`: + use both options. That means: first remove tailing silence, then + check if the token still has a length of at least `min_length`. 
+ + + + + Examples + -------- + + In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is + accepted although it is shorter than `min_length` (3), because it + immediately follows the latest delivered token: + + >>> from auditok.core import StreamTokenizer + >>> from StringDataSource, DataValidator + + >>> class UpperCaseChecker(DataValidator): + >>> def is_valid(self, frame): + return frame.isupper() + >>> dsource = StringDataSource("aaaAAAABBbbb") + >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, + max_length=4, + max_continuous_silence=0) + >>> tokenizer.tokenize(dsource) + [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] + + + The following tokenizer will however reject the 'BB' token: + + >>> dsource = StringDataSource("aaaAAAABBbbb") + >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, max_length=4, + max_continuous_silence=0, + mode=StreamTokenizer.STRICT_MIN_LENGTH) + >>> tokenizer.tokenize(dsource) + [(['A', 'A', 'A', 'A'], 3, 6)] + + + + >>> tokenizer = StreamTokenizer( + >>> validator=UpperCaseChecker(), + >>> min_length=3, + >>> max_length=6, + >>> max_continuous_silence=3, + >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE + >>> ) + >>> dsource = StringDataSource("aaaAAAaaaBBbbbb") + >>> tokenizer.tokenize(dsource) + [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] + + The first token is delivered with its tailing silence because it is + truncated while the second one has its tailing frames removed. + + Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: + + .. 
code:: python + + [ + (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), + (['B', 'B', 'b', 'b', 'b'], 9, 13) + ] + """ - - + SILENCE = 0 POSSIBLE_SILENCE = 1 - POSSIBLE_NOISE = 2 + POSSIBLE_NOISE = 2 NOISE = 3 - + NORMAL = 0 STRICT_MIN_LENGTH = 2 DROP_TRAILING_SILENCE = 4 - # alias - DROP_TAILING_SILENCE = 4 - - def __init__(self, validator, - min_length, max_length, max_continuous_silence, - init_min=0, init_max_silence=0, - mode=0): - - if not isinstance(validator, DataValidator): - raise TypeError("'validator' must be an instance of 'DataValidator'") - + + def __init__( + self, + validator, + min_length, + max_length, + max_continuous_silence, + init_min=0, + init_max_silence=0, + mode=0, + ): + if callable(validator): + self._is_valid = validator + elif isinstance(validator, DataValidator): + self._is_valid = validator.is_valid + else: + raise TypeError( + "'validator' must be a callable or an instance of " + "DataValidator" + ) + if max_length <= 0: - raise ValueError("'max_length' must be > 0 (value={0})".format(max_length)) - + raise ValueError( + "'max_length' must be > 0 (value={0})".format(max_length) + ) + if min_length <= 0 or min_length > max_length: - raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length)) - + err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})" + raise ValueError(err_msg.format(min_length)) + if max_continuous_silence >= max_length: - raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence)) - + err_msg = "'max_continuous_silence' must be < 'max_length' " + err_msg += "(value={0})" + raise ValueError(err_msg.format(max_continuous_silence)) + if init_min >= max_length: - raise ValueError("'init_min' must be < 'max_length' (value={0})".format(max_continuous_silence)) - + raise ValueError( + "'init_min' must be < 'max_length' (value={0})".format( + max_continuous_silence + ) + ) + self.validator = validator self.min_length = min_length 
self.max_length = max_length self.max_continuous_silence = max_continuous_silence self.init_min = init_min self.init_max_silent = init_max_silence - - self._mode = None - self.set_mode(mode) - self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 - self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 - + self._set_mode(mode) self._deliver = None self._tokens = None self._state = None self._data = None self._contiguous_token = False - self._init_count = 0 self._silence_length = 0 self._start_frame = 0 self._current_frame = 0 - - def set_mode(self, mode): - """ - :Parameters: - - `mode` : *(int)* - New mode, must be one of: - - - - `StreamTokenizer.STRICT_MIN_LENGTH` - - - `StreamTokenizer.DROP_TRAILING_SILENCE` - - - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE` - - - `0` - - See `StreamTokenizer.__init__` for more information about the mode. - """ - - if not mode in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE, - self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]: - + + def _set_mode(self, mode): + strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH + strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE + if mode not in [ + StreamTokenizer.NORMAL, + StreamTokenizer.STRICT_MIN_LENGTH, + StreamTokenizer.DROP_TRAILING_SILENCE, + strict_min_and_drop_trailing, + ]: raise ValueError("Wrong value for mode") - self._mode = mode self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 - self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 - - - def get_mode(self): - """ - Return the current mode. To check whether a specific mode is activated use - the bitwise 'and' operator `&`. Example: - - .. 
code:: python - - if mode & self.STRICT_MIN_LENGTH != 0: - do_something() - """ - return self._mode - + self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 + def _reinitialize(self): self._contiguous_token = False self._data = [] @@ -266,112 +1269,114 @@ class StreamTokenizer(): self._state = self.SILENCE self._current_frame = -1 self._deliver = self._append_token - - - def tokenize(self, data_source, callback=None): + + def tokenize(self, data_source, callback=None, generator=False): """ - Read data from `data_source`, one frame a time, and process the read frames in - order to detect sequences of frames that make up valid tokens. - + Read data from `data_source`, one frame a time, and process the read + frames in order to detect sequences of frames that make up valid + tokens. + :Parameters: - `data_source` : instance of the :class:`DataSource` class that implements a `read` method. - 'read' should return a slice of signal, i.e. frame (of whatever \ - type as long as it can be processed by validator) and None if \ - there is no more signal. - + `data_source` : instance of the :class:`DataSource` class that + implements a `read` method. 'read' should return a slice of + signal, i.e. frame (of whatever type as long as it can be + processed by validator) and None if there is no more signal. + `callback` : an optional 3-argument function. - If a `callback` function is given, it will be called each time a valid token - is found. - - + If a `callback` function is given, it will be called each time + a valid token is found. + + :Returns: - A list of tokens if `callback` is None. Each token is tuple with the following elements: - + A list of tokens if `callback` is None. Each token is tuple with the + following elements: + .. code python - + (data, start, end) - - where `data` is a list of read frames, `start`: index of the first frame in the - original data and `end` : index of the last frame. 
- + + where `data` is a list of read frames, `start`: index of the first + frame in the original data and `end` : index of the last frame. """ - + token_gen = self._iter_tokens(data_source) + if callback: + for token in token_gen: + callback(*token) + return + if generator: + return token_gen + return list(token_gen) + + def _iter_tokens(self, data_source): self._reinitialize() - - if callback is not None: - self._deliver = callback - while True: - frame = data_source.read() + frame = data_source.read() + self._current_frame += 1 if frame is None: + token = self._post_process() + if token is not None: + yield token break - self._current_frame += 1 - self._process(frame) - - self._post_process() - - if callback is None: - _ret = self._tokens - self._tokens = None - return _ret - - - def _process(self, frame): - - frame_is_valid = self.validator.is_valid(frame) - + token = self._process(frame) + if token is not None: + yield token + + def _process(self, frame): # noqa: C901 + + frame_is_valid = self._is_valid(frame) + if self._state == self.SILENCE: - + if frame_is_valid: # seems we got a valid frame after a silence self._init_count = 1 self._silence_length = 0 self._start_frame = self._current_frame self._data.append(frame) - - if self._init_count >= self.init_min: + + if self._init_count >= self.init_min: self._state = self.NOISE if len(self._data) >= self.max_length: - self._process_end_of_detection(True) + return self._process_end_of_detection(True) else: self._state = self.POSSIBLE_NOISE - + elif self._state == self.POSSIBLE_NOISE: - + if frame_is_valid: self._silence_length = 0 self._init_count += 1 self._data.append(frame) - if self._init_count >= self.init_min: + if self._init_count >= self.init_min: self._state = self.NOISE if len(self._data) >= self.max_length: - self._process_end_of_detection(True) - - else: + return self._process_end_of_detection(True) + + else: self._silence_length += 1 - if self._silence_length > self.init_max_silent or \ - 
len(self._data) + 1 >= self.max_length: + if ( + self._silence_length > self.init_max_silent + or len(self._data) + 1 >= self.max_length + ): # either init_max_silent or max_length is reached # before _init_count, back to silence self._data = [] self._state = self.SILENCE else: self._data.append(frame) - - + elif self._state == self.NOISE: - + if frame_is_valid: self._data.append(frame) if len(self._data) >= self.max_length: - self._process_end_of_detection(True) - - elif self.max_continuous_silence <= 0 : - # max token reached at this frame will _deliver if _contiguous_token - # and not _strict_min_length - self._process_end_of_detection() + return self._process_end_of_detection(True) + + elif self.max_continuous_silence <= 0: + # max token reached at this frame will _deliver if + # _contiguous_token and not _strict_min_length self._state = self.SILENCE - + return self._process_end_of_detection() else: # this is the first silent frame following a valid one # and it is tolerated @@ -379,61 +1384,63 @@ class StreamTokenizer(): self._data.append(frame) self._state = self.POSSIBLE_SILENCE if len(self._data) == self.max_length: - self._process_end_of_detection(True) - # don't reset _silence_length because we still + return self._process_end_of_detection(True) + # don't reset _silence_length because we still # need to know the total number of silent frames - - - + elif self._state == self.POSSIBLE_SILENCE: - + if frame_is_valid: self._data.append(frame) self._silence_length = 0 self._state = self.NOISE if len(self._data) >= self.max_length: - self._process_end_of_detection(True) - + return self._process_end_of_detection(True) + else: if self._silence_length >= self.max_continuous_silence: - if self._silence_length < len(self._data): - # _deliver only gathered frames aren't all silent - self._process_end_of_detection() - else: - self._data = [] self._state = self.SILENCE + if self._silence_length < len(self._data): + # _deliver only gathered frames aren't all silent + 
return self._process_end_of_detection() + self._data = [] self._silence_length = 0 else: self._data.append(frame) self._silence_length += 1 if len(self._data) >= self.max_length: - self._process_end_of_detection(True) - # don't reset _silence_length because we still + return self._process_end_of_detection(True) + # don't reset _silence_length because we still # need to know the total number of silent frames - - + def _post_process(self): if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE: if len(self._data) > 0 and len(self._data) > self._silence_length: - self._process_end_of_detection() - - + return self._process_end_of_detection() + def _process_end_of_detection(self, truncated=False): - - if not truncated and self._drop_tailing_silence and self._silence_length > 0: + + if ( + not truncated + and self._drop_trailing_silence + and self._silence_length > 0 + ): # happens if max_continuous_silence is reached # or max_length is reached at a silent frame - self._data = self._data[0: - self._silence_length] - - if (len(self._data) >= self.min_length) or \ - (len(self._data) > 0 and \ - not self._strict_min_length and self._contiguous_token): - - - - _end_frame = self._start_frame + len(self._data) - 1 - self._deliver(self._data, self._start_frame, _end_frame) - + self._data = self._data[0 : -self._silence_length] + + if (len(self._data) >= self.min_length) or ( + len(self._data) > 0 + and not self._strict_min_length + and self._contiguous_token + ): + + start_frame = self._start_frame + end_frame = self._start_frame + len(self._data) - 1 + data = self._data + self._data = [] + token = (data, start_frame, end_frame) + if truncated: # next token (if any) will start at _current_frame + 1 self._start_frame = self._current_frame + 1 @@ -441,12 +1448,11 @@ class StreamTokenizer(): self._contiguous_token = True else: self._contiguous_token = False + return token else: - self._contiguous_token = False - + self._contiguous_token = False + self._data = [] - - 
- + def _append_token(self, data, start, end): self._tokens.append((data, start, end)) diff --git a/libs/auditok/dataset.py b/libs/auditok/dataset.py index 1a3a7af5c..98dc5d1d4 100644 --- a/libs/auditok/dataset.py +++ b/libs/auditok/dataset.py @@ -1,19 +1,31 @@ """ -This module contains links to audio files you can use for test purposes. +This module contains links to audio files that can be used for test purposes. + +.. autosummary:: + :toctree: generated/ + + one_to_six_arabic_16000_mono_bc_noise + was_der_mensch_saet_mono_44100_lead_trail_silence """ import os -__all__ = ["one_to_six_arabic_16000_mono_bc_noise", "was_der_mensch_saet_mono_44100_lead_trail_silence"] +__all__ = [ + "one_to_six_arabic_16000_mono_bc_noise", + "was_der_mensch_saet_mono_44100_lead_trail_silence", +] _current_dir = os.path.dirname(os.path.realpath(__file__)) one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\ -16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep) +16000_mono_bc_noise.wav".format( + cd=_current_dir, sep=os.path.sep +) """A wave file that contains a pronunciation of Arabic numbers from 1 to 6""" - was_der_mensch_saet_mono_44100_lead_trail_silence = "{cd}{sep}data{sep}was_\ der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_\ -silence.wav".format(cd=_current_dir, sep=os.path.sep) -""" A wave file that contains a sentence between long leading and trailing periods of silence"""
\ No newline at end of file +silence.wav".format( + cd=_current_dir, sep=os.path.sep +) +"""A wave file that contains a sentence with a long leading and trailing silence""" diff --git a/libs/auditok/exceptions.py b/libs/auditok/exceptions.py index 0026a9d89..7bc5054ee 100644 --- a/libs/auditok/exceptions.py +++ b/libs/auditok/exceptions.py @@ -1,9 +1,41 @@ -""" -November 2015 -@author: Amine SEHILI <[email protected]> -""" - class DuplicateArgument(Exception): pass +class TooSamllBlockDuration(ValueError): + """Raised when block_dur results in a block_size smaller than one sample.""" + + def __init__(self, message, block_dur, sampling_rate): + self.block_dur = block_dur + self.sampling_rate = sampling_rate + super(TooSamllBlockDuration, self).__init__(message) + + +class TimeFormatError(Exception): + """Raised when a duration formatting directive is unknown.""" + + +class EndOfProcessing(Exception): + """Raised within command line script's main function to jump to + postprocessing code.""" + + +class AudioIOError(Exception): + """Raised when a compressed audio file cannot be loaded or when trying + to read from a not yet open AudioSource""" + + +class AudioParameterError(AudioIOError): + """Raised when one audio parameter is missing when loading raw data or + saving data to a format other than raw. Also raised when an audio + parameter has a wrong value.""" + + +class AudioEncodingError(Exception): + """Raised if audio data can not be encoded in the provided format""" + + +class AudioEncodingWarning(RuntimeWarning): + """Raised if audio data can not be encoded in the provided format + but saved as wav. + """ diff --git a/libs/auditok/io.py b/libs/auditok/io.py index 665ab274d..b5fb61a76 100644 --- a/libs/auditok/io.py +++ b/libs/auditok/io.py @@ -1,499 +1,1021 @@ """ Module for low-level audio input-output operations. -Class summary -============= - .. 
autosummary:: + :toctree: generated/ - AudioSource - Rewindable - BufferAudioSource - WaveAudioSource - PyAudioSource - StdinAudioSource - PyAudioPlayer - + AudioSource + Rewindable + BufferAudioSource + WaveAudioSource + PyAudioSource + StdinAudioSource + PyAudioPlayer + from_file + to_file + player_for +""" +import os +import sys +import wave +import warnings +from abc import ABC, abstractmethod +from functools import partial +from .exceptions import AudioIOError, AudioParameterError -Function summary -================ +try: + from pydub import AudioSegment -.. autosummary:: + _WITH_PYDUB = True +except ImportError: + _WITH_PYDUB = False - from_file - player_for -""" +try: + from tqdm import tqdm as _tqdm -from abc import ABCMeta, abstractmethod -import wave -import sys + DEFAULT_BAR_FORMAT_TQDM = "|" + "{bar}" + "|" + "[{elapsed}/{duration}]" + DEFAULT_NCOLS_TQDM = 30 + DEFAULT_NCOLS_TQDM = 30 + DEFAULT_MIN_INTERVAL_TQDM = 0.05 + _WITH_TQDM = True +except ImportError: + _WITH_TQDM = False -__all__ = ["AudioSource", "Rewindable", "BufferAudioSource", "WaveAudioSource", - "PyAudioSource", "StdinAudioSource", "PyAudioPlayer", "from_file", "player_for"] -DEFAULT_SAMPLE_RATE = 16000 +__all__ = [ + "AudioSource", + "Rewindable", + "BufferAudioSource", + "RawAudioSource", + "WaveAudioSource", + "PyAudioSource", + "StdinAudioSource", + "PyAudioPlayer", + "from_file", + "to_file", + "player_for", +] + +DEFAULT_SAMPLING_RATE = 16000 DEFAULT_SAMPLE_WIDTH = 2 DEFAULT_NB_CHANNELS = 1 -class AudioSource(): - """ +def check_audio_data(data, sample_width, channels): + sample_size_bytes = int(sample_width * channels) + nb_samples = len(data) // sample_size_bytes + if nb_samples * sample_size_bytes != len(data): + raise AudioParameterError( + "The length of audio data must be an integer " + "multiple of `sample_width * channels`" + ) + + +def _guess_audio_format(fmt, filename): + if fmt is None: + extension = os.path.splitext(filename.lower())[1][1:] + if extension: + fmt = 
extension + else: + return None + fmt = fmt.lower() + if fmt == "wave": + fmt = "wav" + return fmt + + +def _get_audio_parameters(param_dict): + """ + Get audio parameters from a dictionary of parameters. An audio parameter can + have a long name or a short name. If the long name is present, the short + name will be ignored. If neither is present then `AudioParameterError` is + raised. + + Expected parameters are: + + - `sampling_rate`, `sr` : int, sampling rate. + + - `sample_width`, `sw` : int, sample size in bytes. + + - `channels`, `ch` : int, number of channels. + + Returns + ------- + audio_parameters : tuple + a tuple for audio parameters as (sampling_rate, sample_width, channels). + """ + err_message = ( + "'{ln}' (or '{sn}') must be a positive integer, found: '{val}'" + ) + parameters = [] + for (long_name, short_name) in ( + ("sampling_rate", "sr"), + ("sample_width", "sw"), + ("channels", "ch"), + ): + param = param_dict.get(long_name, param_dict.get(short_name)) + if param is None or not isinstance(param, int) or param <= 0: + raise AudioParameterError( + err_message.format(ln=long_name, sn=short_name, val=param) + ) + parameters.append(param) + sampling_rate, sample_width, channels = parameters + return sampling_rate, sample_width, channels + + +class AudioSource(ABC): + """ Base class for audio source objects. - - Subclasses should implement methods to open/close and audio stream + + Subclasses should implement methods to open/close and audio stream and read the desired amount of audio samples. - - :Parameters: - - `sampling_rate` : int - Number of samples per second of audio stream. Default = 16000. - - `sample_width` : int - Size in bytes of one audio sample. Possible values : 1, 2, 4. - Default = 2. - - `channels` : int - Number of channels of audio stream. The current version supports - only mono audio streams (i.e. one channel). 
- """ - - __metaclass__ = ABCMeta - - def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, - sample_width = DEFAULT_SAMPLE_WIDTH, - channels = DEFAULT_NB_CHANNELS): - - if not sample_width in (1, 2, 4): - raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") - - if channels != 1: - raise ValueError("Only mono audio is currently handled") - - self.sampling_rate = sampling_rate - self.sample_width = sample_width - self.channels = channels - + + Parameters + ---------- + sampling_rate : int + number of samples per second of audio data. + sample_width : int + size in bytes of one audio sample. Possible values: 1, 2 or 4. + channels : int + number of channels of audio data. + """ + + def __init__( + self, sampling_rate, sample_width, channels, + ): + + if sample_width not in (1, 2, 4): + raise AudioParameterError( + "Sample width must be one of: 1, 2 or 4 (bytes)" + ) + + self._sampling_rate = sampling_rate + self._sample_width = sample_width + self._channels = channels + @abstractmethod def is_open(self): - """ Return True if audio source is open, False otherwise """ - + """Return True if audio source is open, False otherwise.""" + @abstractmethod def open(self): - """ Open audio source """ - + """Open audio source.""" + @abstractmethod def close(self): - """ Close audio source """ - + """Close audio source.""" + @abstractmethod def read(self, size): """ Read and return `size` audio samples at most. - - :Parameters: - - `size` : int - the number of samples to read. 
- - :Returns: - - Audio data as a string of length 'N' * 'smaple_width' * 'channels', where 'N' is: - - - `size` if `size` < 'left_samples' - - - 'left_samples' if `size` > 'left_samples' - - """ - - def get_sampling_rate(self): - """ Return the number of samples per second of audio stream """ - return self.sampling_rate - - def get_sample_width(self): - """ Return the number of bytes used to represent one audio sample """ - return self.sample_width - - def get_channels(self): - """ Return the number of channels of this audio source """ + + Parameters + ----------- + size : int + Number of samples to read. + + Returns + ------- + data : bytes + Audio data as a bytes object of length `N * sample_width * channels` + where `N` equals: + + - `size` if `size` <= remaining samples + + - remaining samples if `size` > remaining samples + """ + + @property + def sampling_rate(self): + """Number of samples per second of audio stream.""" + return self._sampling_rate + + @property + def sr(self): + """Number of samples per second of audio stream (alias for + `sampling_rate)`.""" + return self._sampling_rate + + @property + def sample_width(self): + """Number of bytes used to represent one audio sample.""" + return self._sample_width + + @property + def sw(self): + """Number of bytes used to represent one audio sample (alias for + `sample_width`).""" + return self._sample_width + + @property + def channels(self): + """Number of channels in audio stream.""" + return self._channels + + @property + def ch(self): + """Number of channels in audio stream (alias for `channels`).""" return self.channels - -class Rewindable(): +class Rewindable(AudioSource): """ Base class for rewindable audio streams. - Subclasses should implement methods to return to the beginning of an - audio stream as well as method to move to an absolute audio position - expressed in time or in number of samples. 
+ + Subclasses should implement a method to return back to the start of an the + stream (`rewind`), as well as a property getter/setter named `position` that + reads/sets stream position expressed in number of samples. """ - - __metaclass__ = ABCMeta - + @abstractmethod def rewind(self): - """ Go back to the beginning of audio stream """ - pass - - @abstractmethod - def get_position(self): - """ Return the total number of already read samples """ - - @abstractmethod - def get_time_position(self): - """ Return the total duration in seconds of already read data """ - + """Go back to the beginning of audio stream.""" + + @property @abstractmethod - def set_position(self, position): - """ Move to an absolute position - - :Parameters: - - `position` : int - number of samples to skip from the start of the stream - """ - + def position(self): + """Return stream position in number of samples.""" + + @position.setter @abstractmethod - def set_time_position(self, time_position): - """ Move to an absolute position expressed in seconds - - :Parameters: - - `time_position` : float - seconds to skip from the start of the stream - """ - pass + def position(self, position): + """Set stream position in number of samples.""" + + @property + def position_s(self): + """Return stream position in seconds.""" + return self.position / self.sampling_rate - + @position_s.setter + def position_s(self, position_s): + """Set stream position in seconds.""" + self.position = int(self.sampling_rate * position_s) -class BufferAudioSource(AudioSource, Rewindable): + @property + def position_ms(self): + """Return stream position in milliseconds.""" + return (self.position * 1000) // self.sampling_rate + + @position_ms.setter + def position_ms(self, position_ms): + """Set stream position in milliseconds.""" + if not isinstance(position_ms, int): + raise ValueError("position_ms should be an int") + self.position = int(self.sampling_rate * position_ms / 1000) + + +class BufferAudioSource(Rewindable): 
""" - An :class:`AudioSource` that encapsulates and reads data from a memory buffer. - It implements methods from :class:`Rewindable` and is therefore a navigable :class:`AudioSource`. + An `AudioSource` that encapsulates and reads data from a memory buffer. + + This class implements the `Rewindable` interface. + Parameters + ---------- + data : bytes + audio data + sampling_rate : int, default: 16000 + number of samples per second of audio data. + sample_width : int, default: 2 + size in bytes of one audio sample. Possible values: 1, 2 or 4. + channels : int, default: 1 + number of channels of audio data. """ - - def __init__(self, data_buffer, - sampling_rate = DEFAULT_SAMPLE_RATE, - sample_width = DEFAULT_SAMPLE_WIDTH, - channels = DEFAULT_NB_CHANNELS): - - if len(data_buffer) % (sample_width * channels) !=0: - raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") - + + def __init__( + self, data, sampling_rate=16000, sample_width=2, channels=1, + ): AudioSource.__init__(self, sampling_rate, sample_width, channels) - self._buffer = data_buffer - self._index = 0 - self._left = 0 if self._buffer is None else len(self._buffer) + check_audio_data(data, sample_width, channels) + self._data = data + self._sample_size_all_channels = sample_width * channels + self._current_position_bytes = 0 self._is_open = False - + def is_open(self): return self._is_open - + def open(self): self._is_open = True - + def close(self): self._is_open = False self.rewind() - + def read(self, size): if not self._is_open: - raise IOError("Stream is not open") - - if self._left > 0: - - to_read = size * self.sample_width * self.channels - if to_read > self._left: - to_read = self._left - - data = self._buffer[self._index: self._index + to_read] - self._index += to_read - self._left -= to_read - + raise AudioIOError("Stream is not open") + if size is None or size < 0: + offset = None + else: + bytes_to_read = self._sample_size_all_channels * size + offset = 
self._current_position_bytes + bytes_to_read + data = self._data[self._current_position_bytes : offset] + if data: + self._current_position_bytes += len(data) return data - return None - - def get_data_buffer(self): - """ Return all audio data as one string buffer. """ - return self._buffer - - def set_data(self, data_buffer): - """ Set new data for this audio stream. - - :Parameters: - - `data_buffer` : str, basestring, Bytes - a string buffer with a length multiple of (sample_width * channels) - """ - if len(data_buffer) % (self.sample_width * self.channels) !=0: - raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") - self._buffer = data_buffer - self._index = 0 - self._left = 0 if self._buffer is None else len(self._buffer) - - def append_data(self, data_buffer): - """ Append data to this audio stream - - :Parameters: - - `data_buffer` : str, basestring, Bytes - a buffer with a length multiple of (sample_width * channels) - """ - - if len(data_buffer) % (self.sample_width * self.channels) !=0: - raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") - - self._buffer += data_buffer - self._left += len(data_buffer) - - + + @property + def data(self): + """Get raw audio data as a `bytes` object.""" + return self._data + def rewind(self): - self.set_position(0) - - def get_position(self): - return self._index / self.sample_width - - def get_time_position(self): - return float(self._index) / (self.sample_width * self.sampling_rate) - - def set_position(self, position): - if position < 0: - raise ValueError("position must be >= 0") - - if self._buffer is None: - self._index = 0 - self._left = 0 - return - - position *= self.sample_width - self._index = position if position < len(self._buffer) else len(self._buffer) - self._left = len(self._buffer) - self._index + self.position = 0 + + @property + def position(self): + """Get stream position in number of samples""" + return 
self._current_position_bytes // self._sample_size_all_channels + @position.setter + def position(self, position): + """Set stream position in number of samples.""" + position *= self._sample_size_all_channels + if position < 0: + position += len(self.data) + if position < 0 or position > len(self.data): + raise IndexError("Position out of range") + self._current_position_bytes = position - def set_time_position(self, time_position): # time in seconds - position = int(self.sampling_rate * time_position) - self.set_position(position) + @property + def position_ms(self): + """Get stream position in milliseconds.""" + return (self._current_position_bytes * 1000) // ( + self._sample_size_all_channels * self.sampling_rate + ) + @position_ms.setter + def position_ms(self, position_ms): + """Set stream position in milliseconds.""" + if not isinstance(position_ms, int): + raise ValueError("position_ms should be an int") + self.position = int(self.sampling_rate * position_ms / 1000) -class WaveAudioSource(AudioSource): +class FileAudioSource(AudioSource): """ - A class for an `AudioSource` that reads data from a wave file. - - :Parameters: - - `filename` : - path to a valid wave file + Base class `AudioSource`s that read audio data from a file. + + Parameters + ---------- + sampling_rate : int, default: 16000 + number of samples per second of audio data. + sample_width : int, default: 2 + size in bytes of one audio sample. Possible values: 1, 2 or 4. + channels : int, default: 1 + number of channels of audio data. 
""" - - def __init__(self, filename): - - self._filename = filename + + def __init__(self, sampling_rate, sample_width, channels): + AudioSource.__init__(self, sampling_rate, sample_width, channels) self._audio_stream = None - - stream = wave.open(self._filename) - AudioSource.__init__(self, stream.getframerate(), - stream.getsampwidth(), - stream.getnchannels()) - stream.close() - - + + def __del__(self): + if self.is_open(): + self.close() + def is_open(self): return self._audio_stream is not None - - def open(self): - if(self._audio_stream is None): - self._audio_stream = wave.open(self._filename) - - + def close(self): if self._audio_stream is not None: self._audio_stream.close() self._audio_stream = None - - + + @abstractmethod + def _read_from_stream(self, size): + """Read data from stream""" + def read(self, size): + if not self.is_open(): + raise AudioIOError("Audio stream is not open") + data = self._read_from_stream(size) + if not data: + return None + return data + + +class RawAudioSource(FileAudioSource): + """ + A class for an `AudioSource` that reads data from a raw (headerless) audio + file. + + This class should be used for large raw audio files to avoid loading the + whole data to memory. + + Parameters + ---------- + filename : str + path to a raw audio file. + sampling_rate : int + Number of samples per second of audio data. + sample_width : int + Size in bytes of one audio sample. Possible values : 1, 2, 4. + channels : int + Number of channels of audio data. 
+ """ + + def __init__(self, file, sampling_rate, sample_width, channels): + FileAudioSource.__init__(self, sampling_rate, sample_width, channels) + self._file = file + self._audio_stream = None + self._sample_size = sample_width * channels + + def open(self): if self._audio_stream is None: - raise IOError("Stream is not open") + self._audio_stream = open(self._file, "rb") + + def _read_from_stream(self, size): + if size is None or size < 0: + bytes_to_read = None else: - data = self._audio_stream.readframes(size) - if data is None or len(data) < 1: - return None - return data + bytes_to_read = size * self._sample_size + data = self._audio_stream.read(bytes_to_read) + return data + + +class WaveAudioSource(FileAudioSource): + """ + A class for an `AudioSource` that reads data from a wave file. + + This class should be used for large wave files to avoid loading the whole + data to memory. + + Parameters + ---------- + filename : str + path to a valid wave file. + """ + + def __init__(self, filename): + self._filename = filename + self._audio_stream = None + stream = wave.open(self._filename, "rb") + FileAudioSource.__init__( + self, + stream.getframerate(), + stream.getsampwidth(), + stream.getnchannels(), + ) + stream.close() + + def open(self): + if self._audio_stream is None: + self._audio_stream = wave.open(self._filename) + + def _read_from_stream(self, size): + if size is None or size < 0: + size = -1 + return self._audio_stream.readframes(size) class PyAudioSource(AudioSource): """ - A class for an `AudioSource` that reads data the built-in microphone using PyAudio. + A class for an `AudioSource` that reads data from built-in microphone using + PyAudio (https://people.csail.mit.edu/hubert/pyaudio/). + + Parameters + ---------- + sampling_rate : int, default: 16000 + number of samples per second of audio data. + sample_width : int, default: 2 + size in bytes of one audio sample. Possible values: 1, 2 or 4. 
+ channels : int, default: 1 + number of channels of audio data. + frames_per_buffer : int, default: 1024 + PyAudio number of frames per buffer. + input_device_index: None or int, default: None + PyAudio index of audio device to read audio data from. If None default + device is used. """ - - def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, - sample_width = DEFAULT_SAMPLE_WIDTH, - channels = DEFAULT_NB_CHANNELS, - frames_per_buffer = 1024): - - + + def __init__( + self, + sampling_rate=16000, + sample_width=2, + channels=1, + frames_per_buffer=1024, + input_device_index=None, + ): + AudioSource.__init__(self, sampling_rate, sample_width, channels) self._chunk_size = frames_per_buffer - + self.input_device_index = input_device_index + import pyaudio + self._pyaudio_object = pyaudio.PyAudio() - self._pyaudio_format = self._pyaudio_object.get_format_from_width(self.sample_width) + self._pyaudio_format = self._pyaudio_object.get_format_from_width( + self.sample_width + ) self._audio_stream = None - def is_open(self): return self._audio_stream is not None - + def open(self): - self._audio_stream = self._pyaudio_object.open(format = self._pyaudio_format, - channels = self.channels, - rate = self.sampling_rate, - input = True, - output = False, - frames_per_buffer = self._chunk_size) - - + self._audio_stream = self._pyaudio_object.open( + format=self._pyaudio_format, + channels=self.channels, + rate=self.sampling_rate, + input=True, + output=False, + input_device_index=self.input_device_index, + frames_per_buffer=self._chunk_size, + ) + def close(self): if self._audio_stream is not None: self._audio_stream.stop_stream() self._audio_stream.close() self._audio_stream = None - - + def read(self, size): if self._audio_stream is None: raise IOError("Stream is not open") - if self._audio_stream.is_active(): data = self._audio_stream.read(size) if data is None or len(data) < 1: return None return data - return None - -class StdinAudioSource(AudioSource): + +class 
StdinAudioSource(FileAudioSource): """ - A class for an :class:`AudioSource` that reads data from standard input. + A class for an `AudioSource` that reads data from standard input. + + Parameters + ---------- + sampling_rate : int, default: 16000 + number of samples per second of audio data. + sample_width : int, default: 2 + size in bytes of one audio sample. Possible values: 1, 2 or 4. + channels : int, default: 1 + number of channels of audio data. """ - - def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, - sample_width = DEFAULT_SAMPLE_WIDTH, - channels = DEFAULT_NB_CHANNELS): - - AudioSource.__init__(self, sampling_rate, sample_width, channels) + + def __init__( + self, sampling_rate=16000, sample_width=2, channels=1, + ): + FileAudioSource.__init__(self, sampling_rate, sample_width, channels) self._is_open = False - - + self._sample_size = sample_width * channels + self._stream = sys.stdin.buffer + def is_open(self): return self._is_open - + def open(self): self._is_open = True - + def close(self): self._is_open = False - - def read(self, size): - if not self._is_open: - raise IOError("Stream is not open") - - to_read = size * self.sample_width * self.channels - data = sys.stdin.read(to_read) - - if data is None or len(data) < 1: - return None - - return data - - -class PyAudioPlayer(): + + def _read_from_stream(self, size): + bytes_to_read = size * self._sample_size + data = self._stream.read(bytes_to_read) + if data: + return data + return None + + +def _make_tqdm_progress_bar(iterable, total, duration, **tqdm_kwargs): + fmt = tqdm_kwargs.get("bar_format", DEFAULT_BAR_FORMAT_TQDM) + fmt = fmt.replace("{duration}", "{:.3f}".format(duration)) + tqdm_kwargs["bar_format"] = fmt + + tqdm_kwargs["ncols"] = tqdm_kwargs.get("ncols", DEFAULT_NCOLS_TQDM) + tqdm_kwargs["mininterval"] = tqdm_kwargs.get( + "mininterval", DEFAULT_MIN_INTERVAL_TQDM + ) + return _tqdm(iterable, total=total, **tqdm_kwargs) + + +class PyAudioPlayer: """ A class for audio playback 
using Pyaudio + (https://people.csail.mit.edu/hubert/pyaudio/). + + Parameters + ---------- + sampling_rate : int, default: 16000 + number of samples per second of audio data. + sample_width : int, default: 2 + size in bytes of one audio sample. Possible values: 1, 2 or 4. + channels : int, default: 1 + number of channels of audio data. """ - - def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, - sample_width = DEFAULT_SAMPLE_WIDTH, - channels = DEFAULT_NB_CHANNELS): - if not sample_width in (1, 2, 4): - raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") - + + def __init__( + self, sampling_rate=16000, sample_width=2, channels=1, + ): + if sample_width not in (1, 2, 4): + raise ValueError("Sample width in bytes must be one of 1, 2 or 4") + self.sampling_rate = sampling_rate self.sample_width = sample_width self.channels = channels - + import pyaudio + self._p = pyaudio.PyAudio() - self.stream = self._p.open(format = self._p.get_format_from_width(self.sample_width), - channels = self.channels, rate = self.sampling_rate, - input = False, output = True) - - def play(self, data): + self.stream = self._p.open( + format=self._p.get_format_from_width(self.sample_width), + channels=self.channels, + rate=self.sampling_rate, + input=False, + output=True, + ) + + def play(self, data, progress_bar=False, **progress_bar_kwargs): + chunk_gen, nb_chunks = self._chunk_data(data) + if progress_bar and _WITH_TQDM: + duration = len(data) / ( + self.sampling_rate * self.sample_width * self.channels + ) + chunk_gen = _make_tqdm_progress_bar( + chunk_gen, + total=nb_chunks, + duration=duration, + **progress_bar_kwargs + ) if self.stream.is_stopped(): self.stream.start_stream() - - for chunk in self._chunk_data(data): - self.stream.write(chunk) - + try: + for chunk in chunk_gen: + self.stream.write(chunk) + except KeyboardInterrupt: + pass self.stream.stop_stream() - - def stop(self): + + def stop(self): if not self.stream.is_stopped(): self.stream.stop_stream() 
self.stream.close() self._p.terminate() - + def _chunk_data(self, data): # make audio chunks of 100 ms to allow interruption (like ctrl+c) - chunk_size = int((self.sampling_rate * self.sample_width * self.channels) / 10) - start = 0 - while start < len(data): - yield data[start : start + chunk_size] - start += chunk_size - - -def from_file(filename): - """ - Create an `AudioSource` object using the audio file specified by `filename`. - The appropriate :class:`AudioSource` class is guessed from file's extension. - - :Parameters: - - `filename` : - path to an audio file. - - :Returns: - - an `AudioSource` object that reads data from the given file. - - """ - - if filename.lower().endswith(".wav"): - return WaveAudioSource(filename) - - raise Exception("Can not create an AudioSource object from '%s'" %(filename)) - - -def player_for(audio_source): - """ - Return a :class:`PyAudioPlayer` that can play data from `audio_source`. - - :Parameters: - - `audio_source` : - an `AudioSource` object. - - :Returns: - - `PyAudioPlayer` that has the same sampling rate, sample width and number of channels - as `audio_source`. - """ - - return PyAudioPlayer(audio_source.get_sampling_rate(), - audio_source.get_sample_width(), - audio_source.get_channels()) - - + bytes_1_sec = self.sampling_rate * self.sample_width * self.channels + chunk_size = bytes_1_sec // 10 + # make sure chunk_size is a multiple of sample_width * channels + chunk_size -= chunk_size % (self.sample_width * self.channels) + nb_chunks, rest = divmod(len(data), chunk_size) + if rest > 0: + nb_chunks += 1 + chunk_gen = ( + data[i : i + chunk_size] for i in range(0, len(data), chunk_size) + ) + return chunk_gen, nb_chunks + + +def player_for(source): + """ + Return an `AudioPlayer` compatible with `source` (i.e., has the same + sampling rate, sample width and number of channels). + + Parameters + ---------- + source : AudioSource + An object that has `sampling_rate`, `sample_width` and `sample_width` + attributes. 
+ + Returns + ------- + player : PyAudioPlayer + An audio player that has the same sampling rate, sample width + and number of channels as `source`. + """ + return PyAudioPlayer( + source.sampling_rate, source.sample_width, source.channels + ) + + +def get_audio_source(input=None, **kwargs): + """ + Create and return an AudioSource from input. + + Parameters + ---------- + input : str, bytes, "-" or None (default) + source to read audio data from. If `str`, it should be a path to a valid + audio file. If `bytes`, it is used as raw audio data. If it is "-", + raw data will be read from stdin. If None, read audio data from the + microphone using PyAudio. + kwargs + audio parameters used to build the `AudioSource` object. Depending on + the nature of `input`, theses may be omitted (e.g., when `input` is an + audio file in a popular audio format such as wav, ogg, flac, etc.) or + include parameters such as `sampling_rate`, `sample_width`, `channels` + (or their respective short name versions `sr`, `sw` and `ch`) if `input` + is a path to a raw (headerless) audio file, a bytes object for raw audio + data or None (to read data from built-in microphone). See the respective + `AudioSource` classes from more information about possible parameters. 
+ + Returns + ------- + source : AudioSource + audio source created from input parameters + """ + if input == "-": + return StdinAudioSource(*_get_audio_parameters(kwargs)) + if isinstance(input, bytes): + return BufferAudioSource(input, *_get_audio_parameters(kwargs)) + + # read data from a file + if input is not None: + return from_file(filename=input, **kwargs) + + # read data from microphone via pyaudio + else: + frames_per_buffer = kwargs.get("frames_per_buffer", 1024) + input_device_index = kwargs.get("input_device_index") + return PyAudioSource( + *_get_audio_parameters(kwargs), + frames_per_buffer=frames_per_buffer, + input_device_index=input_device_index + ) + + +def _load_raw(file, sampling_rate, sample_width, channels, large_file=False): + """ + Load a raw audio file with standard Python. If `large_file` is True, return + a `RawAudioSource` object that reads data lazily from disk, otherwise load + all data to memory and return a `BufferAudioSource` object. + + Parameters + ---------- + file : str + path to a raw audio data file. + sampling_rate : int + sampling rate of audio data. + sample_width : int + size in bytes of one audio sample. + channels : int + number of channels of audio data. + large_file : bool + if True, return a `RawAudioSource` otherwise a `BufferAudioSource` + object. + + Returns + ------- + source : RawAudioSource or BufferAudioSource + an `AudioSource` that reads data from input file. + """ + if None in (sampling_rate, sample_width, channels): + raise AudioParameterError( + "All audio parameters are required for raw audio files" + ) + + if large_file: + return RawAudioSource( + file, + sampling_rate=sampling_rate, + sample_width=sample_width, + channels=channels, + ) + + with open(file, "rb") as fp: + data = fp.read() + return BufferAudioSource( + data, + sampling_rate=sampling_rate, + sample_width=sample_width, + channels=channels, + ) + + +def _load_wave(file, large_file=False): + """ + Load a wave audio file with standard Python. 
If `large_file` is True, return + a `WaveAudioSource` object that reads data lazily from disk, otherwise load + all data to memory and return a `BufferAudioSource` object. + + Parameters + ---------- + file : str + path to a wav audio data file + large_file : bool + if True, return a `WaveAudioSource` otherwise a `BufferAudioSource` + object. + + Returns + ------- + source : WaveAudioSource or BufferAudioSource + an `AudioSource` that reads data from input file. + """ + if large_file: + return WaveAudioSource(file) + with wave.open(file) as fp: + channels = fp.getnchannels() + srate = fp.getframerate() + swidth = fp.getsampwidth() + data = fp.readframes(-1) + return BufferAudioSource( + data, sampling_rate=srate, sample_width=swidth, channels=channels + ) + + +def _load_with_pydub(file, audio_format=None): + """ + Open compressed audio or video file using pydub. If a video file + is passed, its audio track(s) are extracted and loaded. + + Parameters + ---------- + file : str + path to audio file. + audio_format : str, default: None + string, audio/video file format if known (e.g. raw, webm, wav, ogg) + + Returns + ------- + source : BufferAudioSource + an `AudioSource` that reads data from input file. + """ + func_dict = { + "mp3": AudioSegment.from_mp3, + "ogg": AudioSegment.from_ogg, + "flv": AudioSegment.from_flv, + } + open_function = func_dict.get(audio_format, AudioSegment.from_file) + segment = open_function(file) + return BufferAudioSource( + data=segment.raw_data, + sampling_rate=segment.frame_rate, + sample_width=segment.sample_width, + channels=segment.channels, + ) + + +def from_file(filename, audio_format=None, large_file=False, **kwargs): + """ + Read audio data from `filename` and return an `AudioSource` object. + if `audio_format` is None, the appropriate `AudioSource` class is guessed + from file's extension. `filename` can be a compressed audio or video file. + This will require installing `pydub` (https://github.com/jiaaro/pydub). 
+ + The normal behavior is to load all audio data to memory from which a + :class:`BufferAudioSource` object is created. This should be convenient + most of the time unless audio file is very large. In that case, and + in order to load audio data in lazy manner (i.e. read data from disk each + time :func:`AudioSource.read` is called), `large_file` should be True. + + Note that the current implementation supports only wave and raw formats for + lazy audio loading. + + If an audio format is `raw`, the following keyword arguments are required: + + - `sampling_rate`, `sr`: int, sampling rate of audio data. + - `sample_width`, `sw`: int, size in bytes of one audio sample. + - `channels`, `ch`: int, number of channels of audio data. + + See also + -------- + :func:`to_file`. + + Parameters + ---------- + filename : str + path to input audio or video file. + audio_format : str + audio format used to save data (e.g. raw, webm, wav, ogg). + large_file : bool, default: False + if True, audio won't fully be loaded to memory but only when a window + is read from disk. + + + Other Parameters + ---------------- + sampling_rate, sr: int + sampling rate of audio data + sample_width : int + sample width (i.e. number of bytes used to represent one audio sample) + channels : int + number of channels of audio data + + Returns + ------- + audio_source : AudioSource + an :class:`AudioSource` object that reads data from input file. + + Raises + ------ + `AudioIOError` + raised if audio data cannot be read in the given + format or if `format` is `raw` and one or more audio parameters are missing. 
+ """ + audio_format = _guess_audio_format(audio_format, filename) + + if audio_format == "raw": + srate, swidth, channels = _get_audio_parameters(kwargs) + return _load_raw(filename, srate, swidth, channels, large_file) + + if audio_format in ["wav", "wave"]: + return _load_wave(filename, large_file) + if large_file: + err_msg = "if 'large_file` is True file format should be raw or wav" + raise AudioIOError(err_msg) + if _WITH_PYDUB: + return _load_with_pydub(filename, audio_format=audio_format) + else: + raise AudioIOError( + "pydub is required for audio formats other than raw or wav" + ) + + +def _save_raw(data, file): + """ + Saves audio data as a headerless (i.e. raw) file. + See also :func:`to_file`. + """ + with open(file, "wb") as fp: + fp.write(data) + + +def _save_wave(data, file, sampling_rate, sample_width, channels): + """ + Saves audio data to a wave file. + See also :func:`to_file`. + """ + if None in (sampling_rate, sample_width, channels): + raise AudioParameterError( + "All audio parameters are required to save wave audio files" + ) + with wave.open(file, "w") as fp: + fp.setframerate(sampling_rate) + fp.setsampwidth(sample_width) + fp.setnchannels(channels) + fp.writeframes(data) + + +def _save_with_pydub( + data, file, audio_format, sampling_rate, sample_width, channels +): + """ + Saves audio data with pydub (https://github.com/jiaaro/pydub). + See also :func:`to_file`. + """ + segment = AudioSegment( + data, + frame_rate=sampling_rate, + sample_width=sample_width, + channels=channels, + ) + with open(file, "wb") as fp: + segment.export(fp, format=audio_format) + + +def to_file(data, file, audio_format=None, **kwargs): + """ + Writes audio data to file. If `audio_format` is `None`, output + audio format will be guessed from extension. If `audio_format` + is `None` and `file` comes without an extension then audio + data will be written as a raw audio file. + + Parameters + ---------- + data : bytes-like + audio data to be written. 
Can be a `bytes`, `bytearray`, + `memoryview`, `array` or `numpy.ndarray` object. + file : str + path to output audio file. + audio_format : str + audio format used to save data (e.g. raw, webm, wav, ogg) + kwargs: dict + If an audio format other than `raw` is used, the following keyword + arguments are required: + + - `sampling_rate`, `sr`: int, sampling rate of audio data. + - `sample_width`, `sw`: int, size in bytes of one audio sample. + - `channels`, `ch`: int, number of channels of audio data. + + Raises + ------ + `AudioParameterError` if output format is different than raw and one or more + audio parameters are missing. `AudioIOError` if audio data cannot be written + in the desired format. + """ + audio_format = _guess_audio_format(audio_format, file) + if audio_format in (None, "raw"): + _save_raw(data, file) + return + try: + sampling_rate, sample_width, channels = _get_audio_parameters(kwargs) + except AudioParameterError as exc: + err_message = "All audio parameters are required to save formats " + "other than raw. 
Error detail: {}".format(exc) + raise AudioParameterError(err_message) + if audio_format in ("wav", "wave"): + _save_wave(data, file, sampling_rate, sample_width, channels) + elif _WITH_PYDUB: + _save_with_pydub( + data, file, audio_format, sampling_rate, sample_width, channels + ) + else: + err_message = "cannot write file format {} (file name: {})" + raise AudioIOError(err_message.format(audio_format, file)) diff --git a/libs/auditok/plotting.py b/libs/auditok/plotting.py new file mode 100755 index 000000000..eca5877f4 --- /dev/null +++ b/libs/auditok/plotting.py @@ -0,0 +1,150 @@ +import matplotlib.pyplot as plt +import numpy as np + +AUDITOK_PLOT_THEME = { + "figure": {"facecolor": "#482a36", "alpha": 0.2}, + "plot": {"facecolor": "#282a36"}, + "energy_threshold": { + "color": "#e31f8f", + "linestyle": "--", + "linewidth": 1, + }, + "signal": {"color": "#40d970", "linestyle": "-", "linewidth": 1}, + "detections": { + "facecolor": "#777777", + "edgecolor": "#ff8c1a", + "linewidth": 1, + "alpha": 0.75, + }, +} + + +def _make_time_axis(nb_samples, sampling_rate): + sample_duration = 1 / sampling_rate + x = np.linspace(0, sample_duration * (nb_samples - 1), nb_samples) + return x + + +def _plot_line(x, y, theme, xlabel=None, ylabel=None, **kwargs): + color = theme.get("color", theme.get("c")) + ls = theme.get("linestyle", theme.get("ls")) + lw = theme.get("linewidth", theme.get("lw")) + plt.plot(x, y, c=color, ls=ls, lw=lw, **kwargs) + plt.xlabel(xlabel, fontsize=8) + plt.ylabel(ylabel, fontsize=8) + + +def _plot_detections(subplot, detections, theme): + fc = theme.get("facecolor", theme.get("fc")) + ec = theme.get("edgecolor", theme.get("ec")) + ls = theme.get("linestyle", theme.get("ls")) + lw = theme.get("linewidth", theme.get("lw")) + alpha = theme.get("alpha") + for (start, end) in detections: + subplot.axvspan(start, end, fc=fc, ec=ec, ls=ls, lw=lw, alpha=alpha) + + +def plot( + audio_region, + scale_signal=True, + detections=None, + energy_threshold=None, + 
show=True, + figsize=None, + save_as=None, + dpi=120, + theme="auditok", +): + y = np.asarray(audio_region) + if len(y.shape) == 1: + y = y.reshape(1, -1) + nb_subplots, nb_samples = y.shape + sampling_rate = audio_region.sampling_rate + time_axis = _make_time_axis(nb_samples, sampling_rate) + if energy_threshold is not None: + eth_log10 = energy_threshold * np.log(10) / 10 + amplitude_threshold = np.sqrt(np.exp(eth_log10)) + else: + amplitude_threshold = None + if detections is None: + detections = [] + else: + # End of detection corresponds to the end of the last sample but + # to stay compatible with the time axis of signal plotting we want end + # of detection to correspond to the *start* of the that last sample. + detections = [ + (start, end - (1 / sampling_rate)) for (start, end) in detections + ] + if theme == "auditok": + theme = AUDITOK_PLOT_THEME + + fig = plt.figure(figsize=figsize, dpi=dpi) + fig_theme = theme.get("figure", theme.get("fig", {})) + fig_fc = fig_theme.get("facecolor", fig_theme.get("ffc")) + fig_alpha = fig_theme.get("alpha", 1) + fig.patch.set_facecolor(fig_fc) + fig.patch.set_alpha(fig_alpha) + + plot_theme = theme.get("plot", {}) + plot_fc = plot_theme.get("facecolor", plot_theme.get("pfc")) + + if nb_subplots > 2 and nb_subplots % 2 == 0: + nb_rows = nb_subplots // 2 + nb_columns = 2 + else: + nb_rows = nb_subplots + nb_columns = 1 + + for sid, samples in enumerate(y, 1): + ax = fig.add_subplot(nb_rows, nb_columns, sid) + ax.set_facecolor(plot_fc) + if scale_signal: + std = samples.std() + if std > 0: + mean = samples.mean() + std = samples.std() + samples = (samples - mean) / std + max_ = samples.max() + plt.ylim(-1.5 * max_, 1.5 * max_) + if amplitude_threshold is not None: + if scale_signal and std > 0: + amp_th = (amplitude_threshold - mean) / std + else: + amp_th = amplitude_threshold + eth_theme = theme.get("energy_threshold", theme.get("eth", {})) + _plot_line( + [time_axis[0], time_axis[-1]], + [amp_th] * 2, + eth_theme, + 
label="Detection threshold", + ) + if sid == 1: + legend = plt.legend( + ["Detection threshold"], + facecolor=fig_fc, + framealpha=0.1, + bbox_to_anchor=(0.0, 1.15, 1.0, 0.102), + loc=2, + ) + legend = plt.gca().add_artist(legend) + + signal_theme = theme.get("signal", {}) + _plot_line( + time_axis, + samples, + signal_theme, + xlabel="Time (seconds)", + ylabel="Signal{}".format(" (scaled)" if scale_signal else ""), + ) + detections_theme = theme.get("detections", {}) + _plot_detections(ax, detections, detections_theme) + plt.title("Channel {}".format(sid), fontsize=10) + + plt.xticks(fontsize=8) + plt.yticks(fontsize=8) + plt.tight_layout() + + if save_as is not None: + plt.savefig(save_as, dpi=dpi) + if show: + plt.show() diff --git a/libs/auditok/signal.py b/libs/auditok/signal.py new file mode 100644 index 000000000..3f00fb9e5 --- /dev/null +++ b/libs/auditok/signal.py @@ -0,0 +1,179 @@ +""" +Module for basic audio signal processing and array operations. + +.. autosummary:: + :toctree: generated/ + + to_array + extract_single_channel + compute_average_channel + compute_average_channel_stereo + separate_channels + calculate_energy_single_channel + calculate_energy_multichannel +""" +from array import array as array_ +import audioop +import math + +FORMAT = {1: "b", 2: "h", 4: "i"} +_EPSILON = 1e-10 + + +def to_array(data, sample_width, channels): + """Extract individual channels of audio data and return a list of arrays of + numeric samples. This will always return a list of `array.array` objects + (one per channel) even if audio data is mono. + + Parameters + ---------- + data : bytes + raw audio data. + sample_width : int + size in bytes of one audio sample (one channel considered). + + Returns + ------- + samples_arrays : list + list of arrays of audio samples. 
+ """ + fmt = FORMAT[sample_width] + if channels == 1: + return [array_(fmt, data)] + return separate_channels(data, fmt, channels) + + +def extract_single_channel(data, fmt, channels, selected): + samples = array_(fmt, data) + return samples[selected::channels] + + +def compute_average_channel(data, fmt, channels): + """ + Compute and return average channel of multi-channel audio data. If the + number of channels is 2, use :func:`compute_average_channel_stereo` (much + faster). This function uses satandard `array` module to convert `bytes` data + into an array of numeric values. + + Parameters + ---------- + data : bytes + multi-channel audio data to mix down. + fmt : str + format (single character) to pass to `array.array` to convert `data` + into an array of samples. This should be "b" if audio data's sample width + is 1, "h" if it's 2 and "i" if it's 4. + channels : int + number of channels of audio data. + + Returns + ------- + mono_audio : bytes + mixed down audio data. + """ + all_channels = array_(fmt, data) + mono_channels = [ + array_(fmt, all_channels[ch::channels]) for ch in range(channels) + ] + avg_arr = array_( + fmt, + (round(sum(samples) / channels) for samples in zip(*mono_channels)), + ) + return avg_arr + + +def compute_average_channel_stereo(data, sample_width): + """Compute and return average channel of stereo audio data. This function + should be used when the number of channels is exactly 2 because in that + case we can use standard `audioop` module which *much* faster then calling + :func:`compute_average_channel`. + + Parameters + ---------- + data : bytes + 2-channel audio data to mix down. + sample_width : int + size in bytes of one audio sample (one channel considered). + + Returns + ------- + mono_audio : bytes + mixed down audio data. 
+ """ + fmt = FORMAT[sample_width] + arr = array_(fmt, audioop.tomono(data, sample_width, 0.5, 0.5)) + return arr + + +def separate_channels(data, fmt, channels): + """Create a list of arrays of audio samples (`array.array` objects), one for + each channel. + + Parameters + ---------- + data : bytes + multi-channel audio data to mix down. + fmt : str + format (single character) to pass to `array.array` to convert `data` + into an array of samples. This should be "b" if audio data's sample width + is 1, "h" if it's 2 and "i" if it's 4. + channels : int + number of channels of audio data. + + Returns + ------- + channels_arr : list + list of audio channels, each as a standard `array.array`. + """ + all_channels = array_(fmt, data) + mono_channels = [ + array_(fmt, all_channels[ch::channels]) for ch in range(channels) + ] + return mono_channels + + +def calculate_energy_single_channel(data, sample_width): + """Calculate the energy of mono audio data. Energy is computed as: + + .. math:: energy = 20 \log(\sqrt({1}/{N}\sum_{i}^{N}{a_i}^2)) % # noqa: W605 + + where `a_i` is the i-th audio sample and `N` is the number of audio samples + in data. + + Parameters + ---------- + data : bytes + single-channel audio data. + sample_width : int + size in bytes of one audio sample. + + Returns + ------- + energy : float + energy of audio signal. + """ + energy_sqrt = max(audioop.rms(data, sample_width), _EPSILON) + return 20 * math.log10(energy_sqrt) + + +def calculate_energy_multichannel(x, sample_width, aggregation_fn=max): + """Calculate the energy of multi-channel audio data. Energy is calculated + channel-wise. An aggregation function is applied to the resulting energies + (default: `max`). Also see :func:`calculate_energy_single_channel`. + + Parameters + ---------- + data : bytes + single-channel audio data. + sample_width : int + size in bytes of one audio sample (one channel considered). 
+ aggregation_fn : callable, default: max + aggregation function to apply to the resulting per-channel energies. + + Returns + ------- + energy : float + aggregated energy of multi-channel audio signal. + """ + energies = (calculate_energy_single_channel(xi, sample_width) for xi in x) + return aggregation_fn(energies) diff --git a/libs/auditok/signal_numpy.py b/libs/auditok/signal_numpy.py new file mode 100644 index 000000000..bf5425197 --- /dev/null +++ b/libs/auditok/signal_numpy.py @@ -0,0 +1,30 @@ +import numpy as np +from .signal import ( + compute_average_channel_stereo, + calculate_energy_single_channel, + calculate_energy_multichannel, +) + +FORMAT = {1: np.int8, 2: np.int16, 4: np.int32} + + +def to_array(data, sample_width, channels): + fmt = FORMAT[sample_width] + if channels == 1: + return np.frombuffer(data, dtype=fmt).astype(np.float64) + return separate_channels(data, fmt, channels).astype(np.float64) + + +def extract_single_channel(data, fmt, channels, selected): + samples = np.frombuffer(data, dtype=fmt) + return np.asanyarray(samples[selected::channels], order="C") + + +def compute_average_channel(data, fmt, channels): + array = np.frombuffer(data, dtype=fmt).astype(np.float64) + return array.reshape(-1, channels).mean(axis=1).round().astype(fmt) + + +def separate_channels(data, fmt, channels): + array = np.frombuffer(data, dtype=fmt) + return np.asanyarray(array.reshape(-1, channels).T, order="C") diff --git a/libs/auditok/util.py b/libs/auditok/util.py index d46a8899c..f29eb9bf3 100644 --- a/libs/auditok/util.py +++ b/libs/auditok/util.py @@ -1,448 +1,624 @@ """ -Class summary -============= - .. 
autosummary:: + :toctree: generated/ - DataSource - StringDataSource - ADSFactory - ADSFactory.AudioDataSource - ADSFactory.ADSDecorator - ADSFactory.OverlapADS - ADSFactory.LimiterADS - ADSFactory.RecorderADS - DataValidator - AudioEnergyValidator - + AudioEnergyValidator + AudioReader + Recorder + make_duration_formatter + make_channel_selector """ +from abc import ABC, abstractmethod +import warnings +from functools import partial +from .io import ( + AudioIOError, + AudioSource, + from_file, + BufferAudioSource, + PyAudioSource, + get_audio_source, +) +from .exceptions import ( + DuplicateArgument, + TooSamllBlockDuration, + TimeFormatError, +) +try: + from . import signal_numpy as signal +except ImportError: + from . import signal -from abc import ABCMeta, abstractmethod -import math -from array import array -from .io import Rewindable, from_file, BufferAudioSource, PyAudioSource -from .exceptions import DuplicateArgument -import sys +__all__ = [ + "make_duration_formatter", + "make_channel_selector", + "DataSource", + "DataValidator", + "StringDataSource", + "ADSFactory", + "AudioDataSource", + "AudioReader", + "Recorder", + "AudioEnergyValidator", +] -try: - import numpy - _WITH_NUMPY = True -except ImportError as e: - _WITH_NUMPY = False - -try: - from builtins import str - basestring = str -except ImportError as e: - if sys.version_info >= (3, 0): - basestring = str - - - -__all__ = ["DataSource", "DataValidator", "StringDataSource", "ADSFactory", "AudioEnergyValidator"] - - -class DataSource(): + +def make_duration_formatter(fmt): + """ + Make and return a function used to format durations in seconds. Accepted + format directives are: + + - ``%S`` : absolute number of seconds with 3 decimals. This direction should + be used alone. + - ``%i`` : milliseconds + - ``%s`` : seconds + - ``%m`` : minutes + - ``%h`` : hours + + These last 4 directives should all be specified. They can be placed anywhere + in the input string. 
+ + Parameters + ---------- + fmt : str + duration format. + + Returns + ------- + formatter : callable + a function that takes a duration in seconds (float) and returns a string + that corresponds to that duration. + + Raises + ------ + TimeFormatError + if the format contains an unknown directive. + + Examples + -------- + + Using ``%S``: + + .. code:: python + + formatter = make_duration_formatter("%S") + formatter(123.589) + '123.589' + formatter(123) + '123.000' + + Using the other directives: + + .. code:: python + + formatter = make_duration_formatter("%h:%m:%s.%i") + formatter(3600+120+3.25) + '01:02:03.250' + + formatter = make_duration_formatter("%h hrs, %m min, %s sec and %i ms") + formatter(3600+120+3.25) + '01 hrs, 02 min, 03 sec and 250 ms' + + # omitting one of the 4 directives might result in a wrong duration + formatter = make_duration_formatter("%m min, %s sec and %i ms") + formatter(3600+120+3.25) + '02 min, 03 sec and 250 ms' + """ + if fmt == "%S": + + def fromatter(seconds): + return "{:.3f}".format(seconds) + + elif fmt == "%I": + + def fromatter(seconds): + return "{0}".format(int(seconds * 1000)) + + else: + fmt = fmt.replace("%h", "{hrs:02d}") + fmt = fmt.replace("%m", "{mins:02d}") + fmt = fmt.replace("%s", "{secs:02d}") + fmt = fmt.replace("%i", "{millis:03d}") + try: + i = fmt.index("%") + raise TimeFormatError( + "Unknown time format directive '{0}'".format(fmt[i : i + 2]) + ) + except ValueError: + pass + + def fromatter(seconds): + millis = int(seconds * 1000) + hrs, millis = divmod(millis, 3600000) + mins, millis = divmod(millis, 60000) + secs, millis = divmod(millis, 1000) + return fmt.format(hrs=hrs, mins=mins, secs=secs, millis=millis) + + return fromatter + + +def make_channel_selector(sample_width, channels, selected=None): + """Create and return a callable used for audio channel selection. The + returned selector can be used as `selector(audio_data)` and returns data + that contains selected channel only. 
+ + Importantly, if `selected` is None or equals "any", `selector(audio_data)` + will separate and return a list of available channels: + `[data_channe_1, data_channe_2, ...].` + + Note also that returned `selector` expects `bytes` format for input data but + does notnecessarily return a `bytes` object. In fact, in order to extract + the desired channel (or compute the average channel if `selected` = "avg"), + it first converts input data into a `array.array` (or `numpy.ndarray`) + object. After channel of interst is selected/computed, it is returned as + such, without any reconversion to `bytes`. This behavior is wanted for + efficiency purposes because returned objects can be directly used as buffers + of bytes. In any case, returned objects can be converted back to `bytes` + using `bytes(obj)`. + + Exception to this is the special case where `channels` = 1 in which input + data is returned without any processing. + + + Parameters + ---------- + sample_width : int + number of bytes used to encode one audio sample, should be 1, 2 or 4. + channels : int + number of channels of raw audio data that the returned selector should + expect. + selected : int or str, default: None + audio channel to select and return when calling `selector(raw_data)`. It + should be an int >= `-channels` and < `channels`. If one of "mix", + "avg" or "average" is passed then `selector` will return the average + channel of audio data. If None or "any", return a list of all available + channels at each call. + + Returns + ------- + selector : callable + a callable that can be used as `selector(audio_data)` and returns data + that contains channel of interst. + + Raises + ------ + ValueError + if `sample_width` is not one of 1, 2 or 4, or if `selected` has an + unexpected value. """ - Base class for objects passed to :func:`auditok.core.StreamTokenizer.tokenize`. 
+ fmt = signal.FORMAT.get(sample_width) + if fmt is None: + err_msg = "'sample_width' must be 1, 2 or 4, given: {}" + raise ValueError(err_msg.format(sample_width)) + if channels == 1: + return lambda x: x + + if isinstance(selected, int): + if selected < 0: + selected += channels + if selected < 0 or selected >= channels: + err_msg = "Selected channel must be >= -channels and < channels" + err_msg += ", given: {}" + raise ValueError(err_msg.format(selected)) + return partial( + signal.extract_single_channel, + fmt=fmt, + channels=channels, + selected=selected, + ) + + if selected in ("mix", "avg", "average"): + if channels == 2: + # when data is stereo, using audioop when possible is much faster + return partial( + signal.compute_average_channel_stereo, + sample_width=sample_width, + ) + + return partial( + signal.compute_average_channel, fmt=fmt, channels=channels + ) + + if selected in (None, "any"): + return partial(signal.separate_channels, fmt=fmt, channels=channels) + + raise ValueError( + "Selected channel must be an integer, None (alias 'any') or 'average' " + "(alias 'avg' or 'mix')" + ) + + +class DataSource(ABC): + """ + Base class for objects passed to :func:`StreamTokenizer.tokenize`. Subclasses should implement a :func:`DataSource.read` method. """ - __metaclass__ = ABCMeta - + @abstractmethod def read(self): """ - Read a piece of data read from this source. + Read a block (i.e., window) of data read from this source. If no more data is available, return None. """ - - -class DataValidator(): + + +class DataValidator(ABC): """ - Base class for a validator object used by :class:`.core.StreamTokenizer` to check - if read data is valid. + Base class for a validator object used by :class:`.core.StreamTokenizer` + to check if read data is valid. Subclasses should implement :func:`is_valid` method. 
""" - __metaclass__ = ABCMeta - + @abstractmethod def is_valid(self, data): """ Check whether `data` is valid """ + +class AudioEnergyValidator(DataValidator): + """ + A validator based on audio signal energy. For an input window of `N` audio + samples (see :func:`AudioEnergyValidator.is_valid`), the energy is computed + as: + + .. math:: energy = 20 \log(\sqrt({1}/{N}\sum_{i}^{N}{a_i}^2)) % # noqa: W605 + + where `a_i` is the i-th audio sample. + + Parameters + ---------- + energy_threshold : float + minimum energy that audio window should have to be valid. + sample_width : int + size in bytes of one audio sample. + channels : int + number of channels of audio data. + use_channel : {None, "any", "mix", "avg", "average"} or int + channel to use for energy computation. The following values are + accepted: + + - None (alias "any") : compute energy for each of the channels and return + the maximum value. + - "mix" (alias "avg" or "average") : compute the average channel then + compute its energy. + - int (>= 0 , < `channels`) : compute the energy of the specified channel + and ignore the other ones. + + Returns + ------- + energy : float + energy of the audio window. + """ + + def __init__( + self, energy_threshold, sample_width, channels, use_channel=None + ): + self._sample_width = sample_width + self._selector = make_channel_selector( + sample_width, channels, use_channel + ) + if channels == 1 or use_channel not in (None, "any"): + self._energy_fn = signal.calculate_energy_single_channel + else: + self._energy_fn = signal.calculate_energy_multichannel + self._energy_threshold = energy_threshold + + def is_valid(self, data): + """ + + Parameters + ---------- + data : bytes-like + array of raw audio data + + Returns + ------- + bool + True if the energy of audio data is >= threshold, False otherwise. 
+ """ + log_energy = self._energy_fn(self._selector(data), self._sample_width) + return log_energy >= self._energy_threshold + + class StringDataSource(DataSource): """ - A class that represent a :class:`DataSource` as a string buffer. - Each call to :func:`DataSource.read` returns on character and moves one step forward. - If the end of the buffer is reached, :func:`read` returns None. - - :Parameters: - - `data` : - a basestring object. - + Class that represent a :class:`DataSource` as a string buffer. + Each call to :func:`DataSource.read` returns on character and moves one + step forward. If the end of the buffer is reached, :func:`read` returns + None. + + Parameters + ---------- + data : str + a string object used as data. + """ - + def __init__(self, data): self._data = None self._current = 0 self.set_data(data) - - + def read(self): """ Read one character from buffer. - - :Returns: - - Current character or None if end of buffer is reached + + Returns + ------- + char : str + current character or None if end of buffer is reached. """ - + if self._current >= len(self._data): return None self._current += 1 return self._data[self._current - 1] - + def set_data(self, data): """ Set a new data buffer. - - :Parameters: - - `data` : a basestring object - New data buffer. + + Parameters + ---------- + data : str + new data buffer. """ - - if not isinstance(data, basestring): - raise ValueError("data must an instance of basestring") + + if not isinstance(data, str): + raise ValueError("data must an instance of str") self._data = data self._current = 0 - class ADSFactory: """ - Factory class that makes it easy to create an :class:`ADSFactory.AudioDataSource` object that implements - :class:`DataSource` and can therefore be passed to :func:`auditok.core.StreamTokenizer.tokenize`. - - Whether you read audio data from a file, the microphone or a memory buffer, this factory - instantiates and returns the right :class:`ADSFactory.AudioDataSource` object. 
- - There are many other features you want your :class:`ADSFactory.AudioDataSource` object to have, such as: - memorize all read audio data so that you can rewind and reuse it (especially useful when - reading data from the microphone), read a fixed amount of data (also useful when reading - from the microphone), read overlapping audio frames (often needed when dosing a spectral - analysis of data). - - :func:`ADSFactory.ads` automatically creates and return object with the desired behavior according - to the supplied keyword arguments. - + .. deprecated:: 2.0.0 + `ADSFactory` will be removed in auditok 2.0.1, use instances of + :class:`AudioReader` instead. + + Factory class that makes it easy to create an + :class:`AudioDataSource` object that implements + :class:`DataSource` and can therefore be passed to + :func:`auditok.core.StreamTokenizer.tokenize`. + + Whether you read audio data from a file, the microphone or a memory buffer, + this factory instantiates and returns the right + :class:`AudioDataSource` object. + + There are many other features you want a :class:`AudioDataSource` object to + have, such as: memorize all read audio data so that you can rewind and reuse + it (especially useful when reading data from the microphone), read a fixed + amount of data (also useful when reading from the microphone), read + overlapping audio frames (often needed when dosing a spectral analysis of + data). + + :func:`ADSFactory.ads` automatically creates and return object with the + desired behavior according to the supplied keyword arguments. 
""" - - @staticmethod + + @staticmethod # noqa: C901 def _check_normalize_args(kwargs): - + for k in kwargs: - if not k in ["block_dur", "hop_dur", "block_size", "hop_size", "max_time", "record", - "audio_source", "filename", "data_buffer", "frames_per_buffer", "sampling_rate", - "sample_width", "channels", "sr", "sw", "ch", "asrc", "fn", "fpb", "db", "mt", - "rec", "bd", "hd", "bs", "hs"]: + if k not in [ + "block_dur", + "hop_dur", + "block_size", + "hop_size", + "max_time", + "record", + "audio_source", + "filename", + "data_buffer", + "frames_per_buffer", + "sampling_rate", + "sample_width", + "channels", + "sr", + "sw", + "ch", + "asrc", + "fn", + "fpb", + "db", + "mt", + "rec", + "bd", + "hd", + "bs", + "hs", + ]: raise ValueError("Invalid argument: {0}".format(k)) - + if "block_dur" in kwargs and "bd" in kwargs: - raise DuplicateArgument("Either 'block_dur' or 'bd' must be specified, not both") - + raise DuplicateArgument( + "Either 'block_dur' or 'bd' must be specified, not both" + ) + if "hop_dur" in kwargs and "hd" in kwargs: - raise DuplicateArgument("Either 'hop_dur' or 'hd' must be specified, not both") - + raise DuplicateArgument( + "Either 'hop_dur' or 'hd' must be specified, not both" + ) + if "block_size" in kwargs and "bs" in kwargs: - raise DuplicateArgument("Either 'block_size' or 'bs' must be specified, not both") - + raise DuplicateArgument( + "Either 'block_size' or 'bs' must be specified, not both" + ) + if "hop_size" in kwargs and "hs" in kwargs: - raise DuplicateArgument("Either 'hop_size' or 'hs' must be specified, not both") - + raise DuplicateArgument( + "Either 'hop_size' or 'hs' must be specified, not both" + ) + if "max_time" in kwargs and "mt" in kwargs: - raise DuplicateArgument("Either 'max_time' or 'mt' must be specified, not both") - + raise DuplicateArgument( + "Either 'max_time' or 'mt' must be specified, not both" + ) + if "audio_source" in kwargs and "asrc" in kwargs: - raise DuplicateArgument("Either 'audio_source' or 
'asrc' must be specified, not both") - + raise DuplicateArgument( + "Either 'audio_source' or 'asrc' must be specified, not both" + ) + if "filename" in kwargs and "fn" in kwargs: - raise DuplicateArgument("Either 'filename' or 'fn' must be specified, not both") - + raise DuplicateArgument( + "Either 'filename' or 'fn' must be specified, not both" + ) + if "data_buffer" in kwargs and "db" in kwargs: - raise DuplicateArgument("Either 'filename' or 'db' must be specified, not both") - + raise DuplicateArgument( + "Either 'filename' or 'db' must be specified, not both" + ) + if "frames_per_buffer" in kwargs and "fbb" in kwargs: - raise DuplicateArgument("Either 'frames_per_buffer' or 'fpb' must be specified, not both") - + raise DuplicateArgument( + "Either 'frames_per_buffer' or 'fpb' must be specified, not " + "both" + ) + if "sampling_rate" in kwargs and "sr" in kwargs: - raise DuplicateArgument("Either 'sampling_rate' or 'sr' must be specified, not both") - + raise DuplicateArgument( + "Either 'sampling_rate' or 'sr' must be specified, not both" + ) + if "sample_width" in kwargs and "sw" in kwargs: - raise DuplicateArgument("Either 'sample_width' or 'sw' must be specified, not both") - + raise DuplicateArgument( + "Either 'sample_width' or 'sw' must be specified, not both" + ) + if "channels" in kwargs and "ch" in kwargs: - raise DuplicateArgument("Either 'channels' or 'ch' must be specified, not both") - + raise DuplicateArgument( + "Either 'channels' or 'ch' must be specified, not both" + ) + if "record" in kwargs and "rec" in kwargs: - raise DuplicateArgument("Either 'record' or 'rec' must be specified, not both") - - + raise DuplicateArgument( + "Either 'record' or 'rec' must be specified, not both" + ) + kwargs["bd"] = kwargs.pop("block_dur", None) or kwargs.pop("bd", None) kwargs["hd"] = kwargs.pop("hop_dur", None) or kwargs.pop("hd", None) kwargs["bs"] = kwargs.pop("block_size", None) or kwargs.pop("bs", None) kwargs["hs"] = kwargs.pop("hop_size", None) or 
kwargs.pop("hs", None) kwargs["mt"] = kwargs.pop("max_time", None) or kwargs.pop("mt", None) - kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop("asrc", None) + kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop( + "asrc", None + ) kwargs["fn"] = kwargs.pop("filename", None) or kwargs.pop("fn", None) kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop("db", None) - + record = kwargs.pop("record", False) if not record: record = kwargs.pop("rec", False) if not isinstance(record, bool): raise TypeError("'record' must be a boolean") - + kwargs["rec"] = record - - # keep long names for arguments meant for BufferAudioSource and PyAudioSource + + # keep long names for arguments meant for BufferAudioSource + # and PyAudioSource if "frames_per_buffer" in kwargs or "fpb" in kwargs: - kwargs["frames_per_buffer"] = kwargs.pop("frames_per_buffer", None) or kwargs.pop("fpb", None) - + kwargs["frames_per_buffer"] = kwargs.pop( + "frames_per_buffer", None + ) or kwargs.pop("fpb", None) + if "sampling_rate" in kwargs or "sr" in kwargs: - kwargs["sampling_rate"] = kwargs.pop("sampling_rate", None) or kwargs.pop("sr", None) - - if "sample_width" in kwargs or "sw" in kwargs: - kwargs["sample_width"] = kwargs.pop("sample_width", None) or kwargs.pop("sw", None) - + kwargs["sampling_rate"] = kwargs.pop( + "sampling_rate", None + ) or kwargs.pop("sr", None) + + if "sample_width" in kwargs or "sw" in kwargs: + kwargs["sample_width"] = kwargs.pop( + "sample_width", None + ) or kwargs.pop("sw", None) + if "channels" in kwargs or "ch" in kwargs: - kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop("ch", None) - - - - - - - + kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop( + "ch", None + ) + @staticmethod def ads(**kwargs): - """ - Create an return an :class:`ADSFactory.AudioDataSource`. The type and behavior of the object is the result - of the supplied parameters. 
- - :Parameters: - - *No parameters* : - read audio data from the available built-in microphone with the default parameters. - The returned :class:`ADSFactory.AudioDataSource` encapsulate an :class:`io.PyAudioSource` object and hence - it accepts the next four parameters are passed to use instead of their default values. - - `sampling_rate`, `sr` : *(int)* - number of samples per second. Default = 16000. - - `sample_width`, `sw` : *(int)* - number of bytes per sample (must be in (1, 2, 4)). Default = 2 - - `channels`, `ch` : *(int)* - number of audio channels. Default = 1 (only this value is currently accepted) - - `frames_per_buffer`, `fpb` : *(int)* - number of samples of PyAudio buffer. Default = 1024. - - `audio_source`, `asrc` : an `AudioSource` object - read data from this audio source - - `filename`, `fn` : *(string)* - build an `io.AudioSource` object using this file (currently only wave format is supported) - - `data_buffer`, `db` : *(string)* - build an `io.BufferAudioSource` using data in `data_buffer`. If this keyword is used, - `sampling_rate`, `sample_width` and `channels` are passed to `io.BufferAudioSource` - constructor and used instead of default values. - - `max_time`, `mt` : *(float)* - maximum time (in seconds) to read. Default behavior: read until there is no more data - available. - - `record`, `rec` : *(bool)* - save all read data in cache. Provide a navigable object which boasts a `rewind` method. - Default = False. - - `block_dur`, `bd` : *(float)* - processing block duration in seconds. This represents the quantity of audio data to return - each time the :func:`read` method is invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling - rate is 8000 and the sample width is 2 bytes, :func:`read` returns a buffer of 0.025 * 8000 * 2 = 400 - bytes at most. This parameter will be looked for (and used if available) before `block_size`. - If neither parameter is given, `block_dur` will be set to 0.01 second (i.e. 
10 ms) - - - `hop_dur`, `hd` : *(float)* - quantity of data to skip from current processing window. if `hop_dur` is supplied then there - will be an overlap of `block_dur` - `hop_dur` between two adjacent blocks. This - parameter will be looked for (and used if available) before `hop_size`. If neither parameter - is given, `hop_dur` will be set to `block_dur` which means that there will be no overlap - between two consecutively read blocks. - - `block_size`, `bs` : *(int)* - number of samples to read each time the `read` method is called. Default: a block size - that represents a window of 10ms, so for a sampling rate of 16000, the default `block_size` - is 160 samples, for a rate of 44100, `block_size` = 441 samples, etc. - - `hop_size`, `hs` : *(int)* - determines the number of overlapping samples between two adjacent read windows. For a - `hop_size` of value *N*, the overlap is `block_size` - *N*. Default : `hop_size` = `block_size`, - means that there is no overlap. - - :Returns: - - An AudioDataSource object that has the desired features. - - :Exampels: - - 1. **Create an AudioDataSource that reads data from the microphone (requires Pyaudio) with default audio parameters:** - - .. code:: python - - from auditok import ADSFactory - ads = ADSFactory.ads() - ads.get_sampling_rate() - 16000 - ads.get_sample_width() - 2 - ads.get_channels() - 1 - - - 2. **Create an AudioDataSource that reads data from the microphone with a sampling rate of 48KHz:** - - .. code:: python - - from auditok import ADSFactory - ads = ADSFactory.ads(sr=48000) - ads.get_sampling_rate() - 48000 - - 3. **Create an AudioDataSource that reads data from a wave file:** - - .. code:: python - - import auditok - from auditok import ADSFactory - ads = ADSFactory.ads(fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence) - ads.get_sampling_rate() - 44100 - ads.get_sample_width() - 2 - ads.get_channels() - 1 - - 4. **Define size of read blocks as 20 ms** - - .. 
code:: python - - import auditok - from auditok import ADSFactory - ''' - we know samling rate for previous file is 44100 samples/second - so 10 ms are equivalent to 441 samples and 20 ms to 882 - ''' - block_size = 882 - ads = ADSFactory.ads(bs = 882, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence) - ads.open() - # read one block - data = ads.read() - ads.close() - len(data) - 1764 - assert len(data) == ads.get_sample_width() * block_size - - 5. **Define block size as a duration (use block_dur or bd):** - - .. code:: python - - import auditok - from auditok import ADSFactory - dur = 0.25 # second - ads = ADSFactory.ads(bd = dur, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence) - ''' - we know samling rate for previous file is 44100 samples/second - for a block duration of 250 ms, block size should be 0.25 * 44100 = 11025 - ''' - ads.get_block_size() - 11025 - assert ads.get_block_size() == int(0.25 * 44100) - ads.open() - # read one block - data = ads.read() - ads.close() - len(data) - 22050 - assert len(data) == ads.get_sample_width() * ads.get_block_size() - - 6. **Read overlapping blocks (one of hope_size, hs, hop_dur or hd > 0):** - - For better readability we'd better use :class:`auditok.io.BufferAudioSource` with a string buffer: - - .. code:: python - - import auditok - from auditok import ADSFactory - ''' - we supply a data beffer instead of a file (keyword 'bata_buffer' or 'db') - sr : sampling rate = 16 samples/sec - sw : sample width = 1 byte - ch : channels = 1 - ''' - buffer = "abcdefghijklmnop" # 16 bytes = 1 second of data - bd = 0.250 # block duration = 250 ms = 4 bytes - hd = 0.125 # hop duration = 125 ms = 2 bytes - ads = ADSFactory.ads(db = "abcdefghijklmnop", bd = bd, hd = hd, sr = 16, sw = 1, ch = 1) - ads.open() - ads.read() - 'abcd' - ads.read() - 'cdef' - ads.read() - 'efgh' - ads.read() - 'ghij' - data = ads.read() - assert data == 'ijkl' - - 7. 
**Limit amount of read data (use max_time or mt):** - - .. code:: python - - ''' - We know audio file is larger than 2.25 seconds - We want to read up to 2.25 seconds of audio data - ''' - ads = ADSFactory.ads(mt = 2.25, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence) - ads.open() - data = [] - while True: - d = ads.read() - if d is None: - break - data.append(d) - - ads.close() - data = b''.join(data) - assert len(data) == int(ads.get_sampling_rate() * 2.25 * ads.get_sample_width() * ads.get_channels()) + Create an return an :class:`AudioDataSource`. The type and + behavior of the object is the result + of the supplied parameters. Called without any parameters, the class + will read audio data from the available built-in microphone with the + default parameters. + + Parameters + ---------- + sampling_rate, sr : int, default: 16000 + number of audio samples per second of input audio stream. + sample_width, sw : int, default: 2 + number of bytes per sample, must be one of 1, 2 or 4 + channels, ch : int, default: 1 + number of audio channels, only a value of 1 is currently accepted. + frames_per_buffer, fpb : int, default: 1024 + number of samples of PyAudio buffer. + audio_source, asrc : `AudioSource` + `AudioSource` to read data from + filename, fn : str + create an `AudioSource` object using this file + data_buffer, db : str + build an `io.BufferAudioSource` using data in `data_buffer`. + If this keyword is used, + `sampling_rate`, `sample_width` and `channels` are passed to + `io.BufferAudioSource` constructor and used instead of default + values. + max_time, mt : float + maximum time (in seconds) to read. Default behavior: read until + there is no more data + available. + record, rec : bool, default = False + save all read data in cache. Provide a navigable object which has a + `rewind` method. + block_dur, bd : float + processing block duration in seconds. 
This represents the quantity + of audio data to return each time the :func:`read` method is + invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling rate + is 8000 and the sample width is 2 bytes, :func:`read` returns a + buffer of 0.025 * 8000 * 2 = 400 bytes at most. This parameter will + be looked for (and used if available) before `block_size`. If + neither parameter is given, `block_dur` will be set to 0.01 second + (i.e. 10 ms) + hop_dur, hd : float + quantity of data to skip from current processing window. if + `hop_dur` is supplied then there will be an overlap of `block_dur` + - `hop_dur` between two adjacent blocks. This parameter will be + looked for (and used if available) before `hop_size`. + If neither parameter is given, `hop_dur` will be set to `block_dur` + which means that there will be no overlap between two consecutively + read blocks. + block_size, bs : int + number of samples to read each time the `read` method is called. + Default: a block size that represents a window of 10ms, so for a + sampling rate of 16000, the default `block_size` is 160 samples, + for a rate of 44100, `block_size` = 441 samples, etc. + hop_size, hs : int + determines the number of overlapping samples between two adjacent + read windows. For a `hop_size` of value *N*, the overlap is + `block_size` - *N*. Default : `hop_size` = `block_size`, means that + there is no overlap. + + Returns + ------- + audio_data_source : AudioDataSource + an `AudioDataSource` object build with input parameters. """ - - # copy user's dicionary (shallow copy) - kwargs = kwargs.copy() - + warnings.warn( + "'ADSFactory' is deprecated and will be removed in a future " + "release. 
Please use AudioReader class instead.", + DeprecationWarning, + ) + # check and normalize keyword arguments ADSFactory._check_normalize_args(kwargs) - + block_dur = kwargs.pop("bd") hop_dur = kwargs.pop("hd") block_size = kwargs.pop("bs") @@ -452,431 +628,483 @@ class ADSFactory: filename = kwargs.pop("fn") data_buffer = kwargs.pop("db") record = kwargs.pop("rec") - + # Case 1: an audio source is supplied if audio_source is not None: if (filename, data_buffer) != (None, None): - raise Warning("You should provide one of 'audio_source', 'filename' or 'data_buffer'\ - keyword parameters. 'audio_source' will be used") - + raise Warning( + "You should provide one of 'audio_source', 'filename' or \ + 'data_buffer' keyword parameters. 'audio_source' will be \ + used" + ) + # Case 2: a file name is supplied elif filename is not None: if data_buffer is not None: - raise Warning("You should provide one of 'filename' or 'data_buffer'\ - keyword parameters. 'filename' will be used") + raise Warning( + "You should provide one of 'filename' or 'data_buffer'\ + keyword parameters. 
'filename' will be used" + ) audio_source = from_file(filename) - - # Case 3: a data_buffer is supplied + + # Case 3: a data_buffer is supplied elif data_buffer is not None: - audio_source = BufferAudioSource(data_buffer = data_buffer, **kwargs) - + audio_source = BufferAudioSource(data=data_buffer, **kwargs) + # Case 4: try to access native audio input else: audio_source = PyAudioSource(**kwargs) - - + if block_dur is not None: if block_size is not None: - raise DuplicateArgument("Either 'block_dur' or 'block_size' can be specified, not both") - else: - block_size = int(audio_source.get_sampling_rate() * block_dur) - elif block_size is None: - # Set default block_size to 10 ms - block_size = int(audio_source.get_sampling_rate() / 100) - - # Instantiate base AudioDataSource - ads = ADSFactory.AudioDataSource(audio_source=audio_source, block_size=block_size) - - # Limit data to be read - if max_time is not None: - ads = ADSFactory.LimiterADS(ads=ads, max_time=max_time) - - # Record, rewind and reuse data - if record: - ads = ADSFactory.RecorderADS(ads=ads) - + raise DuplicateArgument( + "Either 'block_dur' or 'block_size' can be specified, not \ + both" + ) + elif block_size is not None: + block_dur = block_size / audio_source.sr + else: + block_dur = 0.01 # 10 ms + # Read overlapping blocks of data if hop_dur is not None: if hop_size is not None: - raise DuplicateArgument("Either 'hop_dur' or 'hop_size' can be specified, not both") - else: - hop_size = int(audio_source.get_sampling_rate() * hop_dur) - - if hop_size is not None: - if hop_size <= 0 or hop_size > block_size: - raise ValueError("hop_size must be > 0 and <= block_size") - if hop_size < block_size: - ads = ADSFactory.OverlapADS(ads=ads, hop_size=hop_size) - + raise DuplicateArgument( + "Either 'hop_dur' or 'hop_size' can be specified, not both" + ) + elif hop_size is not None: + hop_dur = hop_size / audio_source.sr + + ads = AudioDataSource( + audio_source, + block_dur=block_dur, + hop_dur=hop_dur, + 
record=record, + max_read=max_time, + ) return ads - - - class AudioDataSource(DataSource): - """ - Base class for AudioDataSource objects. - It inherits from DataSource and encapsulates an AudioSource object. - """ - - def __init__(self, audio_source, block_size): - - self.audio_source = audio_source - self.block_size = block_size - - def get_block_size(self): - return self.block_size - - def set_block_size(self, size): - self.block_size = size - - def get_audio_source(self): - return self.audio_source - - def set_audio_source(self, audio_source): - self.audio_source = audio_source - - def open(self): - self.audio_source.open() - - def close(self): - self.audio_source.close() - - def is_open(self): - return self.audio_source.is_open() - - def get_sampling_rate(self): - return self.audio_source.get_sampling_rate() - - def get_sample_width(self): - return self.audio_source.get_sample_width() - - def get_channels(self): - return self.audio_source.get_channels() - - - def rewind(self): - if isinstance(self.audio_source, Rewindable): - self.audio_source.rewind() - else: - raise Exception("Audio source is not rewindable") - - - - def is_rewindable(self): - return isinstance(self.audio_source, Rewindable) - - - def read(self): - return self.audio_source.read(self.block_size) - - - class ADSDecorator(AudioDataSource): - """ - Base decorator class for AudioDataSource objects. 
- """ - __metaclass__ = ABCMeta - - def __init__(self, ads): - self.ads = ads - - self.get_block_size = self.ads.get_block_size - self.set_block_size = self.ads.set_block_size - self.get_audio_source = self.ads.get_audio_source - self.open = self.ads.open - self.close = self.ads.close - self.is_open = self.ads.is_open - self.get_sampling_rate = self.ads.get_sampling_rate - self.get_sample_width = self.ads.get_sample_width - self.get_channels = self.ads.get_channels - - def is_rewindable(self): - return self.ads.is_rewindable - - def rewind(self): - self.ads.rewind() - self._reinit() - - def set_audio_source(self, audio_source): - self.ads.set_audio_source(audio_source) - self._reinit() - - def open(self): - if not self.ads.is_open(): - self.ads.open() - self._reinit() - - @abstractmethod - def _reinit(self): - pass - - - class OverlapADS(ADSDecorator): - """ - A class for AudioDataSource objects that can read and return overlapping audio frames - """ - - def __init__(self, ads, hop_size): - ADSFactory.ADSDecorator.__init__(self, ads) - - if hop_size <= 0 or hop_size > self.get_block_size(): - raise ValueError("hop_size must be either 'None' or \ - between 1 and block_size (both inclusive)") - self.hop_size = hop_size - self._actual_block_size = self.get_block_size() - self._reinit() - - - def _get_block_size(): - return self._actual_block_size - - - def _read_first_block(self): - # For the first call, we need an entire block of size 'block_size' - block = self.ads.read() - if block is None: - return None - - # Keep a slice of data in cache and append it in the next call - if len(block) > self._hop_size_bytes: - self._cache = block[self._hop_size_bytes:] - - # Up from the next call, we will use '_read_next_blocks' - # and we only read 'hop_size' - self.ads.set_block_size(self.hop_size) - self.read = self._read_next_blocks - - return block - - def _read_next_blocks(self): - block = self.ads.read() - if block is None: - return None - - # Append block to cache data to 
ensure overlap - block = self._cache + block - # Keep a slice of data in cache only if we have a full length block - # if we don't that means that this is the last block - if len(block) == self._block_size_bytes: - self._cache = block[self._hop_size_bytes:] - else: - self._cache = None - - return block - def read(self): - pass - - def _reinit(self): + +class _AudioReadingProxy: + def __init__(self, audio_source): + + self._audio_source = audio_source + + def rewind(self): + if self.rewindable: + self._audio_source.rewind() + else: + raise AudioIOError("Audio stream is not rewindable") + + def rewindable(self): + try: + return self._audio_source.rewindable + except AttributeError: + return False + + def is_open(self): + return self._audio_source.is_open() + + def open(self): + self._audio_source.open() + + def close(self): + self._audio_source.close() + + def read(self, size): + return self._audio_source.read(size) + + @property + def data(self): + err_msg = "This AudioReader is not a recorder, no recorded data can " + err_msg += "be retrieved" + raise AttributeError(err_msg) + + def __getattr__(self, name): + return getattr(self._audio_source, name) + + +class _Recorder(_AudioReadingProxy): + """ + Class for `AudioReader` objects that can record all data they read. Useful + when reading data from microphone. + """ + + def __init__(self, audio_source): + super(_Recorder, self).__init__(audio_source) + self._cache = [] + self._read_block = self._read_and_cache + self._read_from_cache = False + self._data = None + + def read(self, size): + return self._read_block(size) + + @property + def data(self): + if self._data is None: + err_msg = "Unrewinded recorder. 
`rewind` should be called before " + err_msg += "accessing recorded data" + raise RuntimeError(err_msg) + return self._data + + def rewindable(self): + return True + + def rewind(self): + if self._read_from_cache: + self._audio_source.rewind() + else: + self._data = b"".join(self._cache) self._cache = None - self.ads.set_block_size(self._actual_block_size) - self._hop_size_bytes = self.hop_size * \ - self.get_sample_width() * \ - self.get_channels() - self._block_size_bytes = self.get_block_size() * \ - self.get_sample_width() * \ - self.get_channels() - self.read = self._read_first_block + self._audio_source = BufferAudioSource( + self._data, self.sr, self.sw, self.ch + ) + self._read_block = self._audio_source.read + self.open() + self._read_from_cache = True + def _read_and_cache(self, size): + # Read and save read data + block = self._audio_source.read(size) + if block is not None: + self._cache.append(block) + return block - class LimiterADS(ADSDecorator): - """ - A class for AudioDataSource objects that can read a fixed amount of data. - This can be useful when reading data from the microphone or from large audio files. - """ - - def __init__(self, ads, max_time): - ADSFactory.ADSDecorator.__init__(self, ads) - - self.max_time = max_time - self._reinit() - - def read(self): - if self._total_read_bytes >= self._max_read_bytes: - return None - block = self.ads.read() - if block is None: - return None - self._total_read_bytes += len(block) - - if self._total_read_bytes >= self._max_read_bytes: - self.close() - - return block - - - def _reinit(self): - self._max_read_bytes = int(self.max_time * self.get_sampling_rate()) * \ - self.get_sample_width() * \ - self.get_channels() - self._total_read_bytes = 0 +class _Limiter(_AudioReadingProxy): + """ + Class for `AudioReader` objects that can read a fixed amount of data. + This can be useful when reading data from the microphone or from large + audio files. 
+ """ - + def __init__(self, audio_source, max_read): + super(_Limiter, self).__init__(audio_source) + self._max_read = max_read + self._max_samples = round(max_read * self.sr) + self._bytes_per_sample = self.sw * self.ch + self._read_samples = 0 - class RecorderADS(ADSDecorator): - """ - A class for AudioDataSource objects that can record all audio data they read, - with a rewind facility. - """ - - def __init__(self, ads): - ADSFactory.ADSDecorator.__init__(self, ads) - - self._reinit() - - def read(self): - pass - - def _read_and_rec(self): - # Read and save read data - block = self.ads.read() - if block is not None: - self._cache.append(block) - + @property + def data(self): + data = self._audio_source.data + max_read_bytes = self._max_samples * self._bytes_per_sample + return data[:max_read_bytes] + + @property + def max_read(self): + return self._max_read + + def read(self, size): + size = min(self._max_samples - self._read_samples, size) + if size <= 0: + return None + block = self._audio_source.read(size) + if block is None: + return None + self._read_samples += len(block) // self._bytes_per_sample + return block + + def rewind(self): + super(_Limiter, self).rewind() + self._read_samples = 0 + + +class _FixedSizeAudioReader(_AudioReadingProxy): + """ + Class to read fixed-size audio windows from source. + """ + + def __init__(self, audio_source, block_dur): + super(_FixedSizeAudioReader, self).__init__(audio_source) + + if block_dur <= 0: + raise ValueError( + "block_dur must be > 0, given: {}".format(block_dur) + ) + + self._block_size = int(block_dur * self.sr) + if self._block_size == 0: + err_msg = "Too small block_dur ({0:f}) for sampling rate ({1}). " + err_msg += "block_dur should cover at least one sample " + err_msg += "(i.e. 
1/{1})" + raise TooSamllBlockDuration( + err_msg.format(block_dur, self.sr), block_dur, self.sr + ) + + def read(self): + return self._audio_source.read(self._block_size) + + @property + def block_size(self): + return self._block_size + + @property + def block_dur(self): + return self._block_size / self.sr + + def __getattr__(self, name): + return getattr(self._audio_source, name) + + +class _OverlapAudioReader(_FixedSizeAudioReader): + """ + Class for `AudioReader` objects that can read and return overlapping audio + windows. + """ + + def __init__(self, audio_source, block_dur, hop_dur): + + if hop_dur >= block_dur: + raise ValueError('"hop_dur" should be < "block_dur"') + + super(_OverlapAudioReader, self).__init__(audio_source, block_dur) + + self._hop_size = int(hop_dur * self.sr) + self._blocks = self._iter_blocks_with_overlap() + + def _iter_blocks_with_overlap(self): + while not self.is_open(): + yield AudioIOError + block = self._audio_source.read(self._block_size) + if block is None: + yield None + + _hop_size_bytes = ( + self._hop_size * self._audio_source.sw * self._audio_source.ch + ) + cache = block[_hop_size_bytes:] + yield block + + while True: + block = self._audio_source.read(self._hop_size) + if block: + block = cache + block + cache = block[_hop_size_bytes:] + yield block + continue + yield None + + def read(self): + try: + block = next(self._blocks) + if block == AudioIOError: + raise AudioIOError("Audio Stream is not open.") return block - - - def _read_simple(self): - # Read without recording - return self.ads.read() - - def rewind(self): - if self._record: - # If has been recording, create a new BufferAudioSource - # from recorded data - dbuffer = self._concatenate(self._cache) - asource = BufferAudioSource(dbuffer, self.get_sampling_rate(), - self.get_sample_width(), - self.get_channels()) - - - self.set_audio_source(asource) - self.open() - self._cache = [] - self._record = False - self.read = self._read_simple - - else: - 
self.ads.rewind() - if not self.is_open(): - self.open() - - - def is_rewindable(self): - return True - - def _reinit(self): - # when audio_source is replaced, start recording again - self._record = True - self._cache = [] - self.read = self._read_and_rec - - def _concatenate(self, data): - try: - # should always work for python 2 - # work for python 3 ONLY if data is a list (or an iterator) - # whose each element is a 'bytes' objects - return b''.join(data) - except TypeError: - # work for 'str' in python 2 and python 3 - return ''.join(data) + except StopIteration: + return None + def rewind(self): + super(_OverlapAudioReader, self).rewind() + self._blocks = self._iter_blocks_with_overlap() -class AudioEnergyValidator(DataValidator): + @property + def hop_size(self): + return self._hop_size + + @property + def hop_dur(self): + return self._hop_size / self.sr + + def __getattr__(self, name): + return getattr(self._audio_source, name) + + +class AudioReader(DataSource): """ - The most basic auditok audio frame validator. - This validator computes the log energy of an input audio frame - and return True if the result is >= a given threshold, False - otherwise. - - :Parameters: - - `sample_width` : *(int)* - Number of bytes of one audio sample. This is used to convert data from `basestring` or `Bytes` to - an array of floats. - - `energy_threshold` : *(float)* - A threshold used to check whether an input data buffer is valid. + Class to read fixed-size chunks of audio data from a source. A source can + be a file on disk, standard input (with `input` = "-") or microphone. This + is normally used by tokenization algorithms that expect source objects with + a `read` function that returns a windows of data of the same size at each + call expect when remaining data does not make up a full window. 
+ + Objects of this class can be set up to return audio windows with a given + overlap and to record the whole stream for later access (useful when + reading data from the microphone). They can also have + a limit for the maximum amount of data to read. + + Parameters + ---------- + input : str, bytes, AudioSource, AudioReader, AudioRegion or None + input audio data. If the type of the passed argument is `str`, it should + be a path to an existing audio file. "-" is interpreted as standardinput. + If the type is `bytes`, input is considered as a buffer of raw audio + data. If None, read audio from microphone. Every object that is not an + :class:`AudioReader` will be transformed, when possible, into an + :class:`AudioSource` before processing. If it is an `str` that refers to + a raw audio file, `bytes` or None, audio parameters should be provided + using kwargs (i.e., `samplig_rate`, `sample_width` and `channels` or + their alias). + block_dur: float, default: 0.01 + length in seconds of audio windows to return at each `read` call. + hop_dur: float, default: None + length in seconds of data amount to skip from previous window. If + defined, it is used to compute the temporal overlap between previous and + current window (nameply `overlap = block_dur - hop_dur`). Default, None, + means that consecutive windows do not overlap. + record: bool, default: False + whether to record read audio data for later access. If True, audio data + can be retrieved by first calling `rewind()`, then using the `data` + property. Note that once `rewind()` is called, no new data will be read + from source (subsequent `read()` call will read data from cache) and + that there's no need to call `rewind()` again to access `data` property. + max_read: float, default: None + maximum amount of audio data to read in seconds. Default is None meaning + that data will be read until end of stream is reached or, when reading + from microphone a Ctrl-C is sent. 
+ + When `input` is None, of type bytes or a raw audio files some of the + follwing kwargs are mandatory. + + Other Parameters + ---------------- + audio_format, fmt : str + type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be + used if `input` is a string path to an audio file. If not given, audio + type will be guessed from file name extension or from file header. + sampling_rate, sr : int + sampling rate of audio data. Required if `input` is a raw audio file, is + a bytes object or None (i.e., read from microphone). + sample_width, sw : int + number of bytes used to encode one audio sample, typically 1, 2 or 4. + Required for raw data, see `sampling_rate`. + channels, ch : int + number of channels of audio data. Required for raw data, see + `sampling_rate`. + use_channel, uc : {None, "any", "mix", "avg", "average"} or int + which channel to use for split if `input` has multiple audio channels. + Regardless of which channel is used for splitting, returned audio events + contain data from *all* the channels of `input`. The following values + are accepted: + + - None (alias "any"): accept audio activity from any channel, even if + other channels are silent. This is the default behavior. + + - "mix" (alias "avg" or "average"): mix down all channels (i.e., compute + average channel) and split the resulting channel. + + - int (>= 0 , < `channels`): use one channel, specified by its integer + id, for split. + + large_file : bool, default: False + If True, AND if `input` is a path to a *wav* of a *raw* audio file + (and only these two formats) then audio data is lazily loaded to memory + (i.e., one analysis window a time). Otherwise the whole file is loaded + to memory before split. Set to True if the size of the file is larger + than available memory. 
+ """ + + def __init__( + self, + input, + block_dur=0.01, + hop_dur=None, + record=False, + max_read=None, + **kwargs + ): + if not isinstance(input, AudioSource): + input = get_audio_source(input, **kwargs) + self._record = record + if record: + input = _Recorder(input) + if max_read is not None: + input = _Limiter(input, max_read) + self._max_read = max_read + if hop_dur is not None: + input = _OverlapAudioReader(input, block_dur, hop_dur) + else: + input = _FixedSizeAudioReader(input, block_dur) + self._audio_source = input + + def __repr__(self): + block_dur, hop_dur, max_read = None, None, None + if self.block_dur is not None: + block_dur = "{:.3f}".format(self.block_dur) + if self.hop_dur is not None: + hop_dur = "{:.3f}".format(self.hop_dur) + if self.max_read is not None: + max_read = "{:.3f}".format(self.max_read) + return ( + "{cls}(block_dur={block_dur}, " + "hop_dur={hop_dur}, record={rewindable}, " + "max_read={max_read})" + ).format( + cls=self.__class__.__name__, + block_dur=block_dur, + hop_dur=hop_dur, + rewindable=self._record, + max_read=max_read, + ) + + @property + def rewindable(self): + return self._record + + @property + def block_dur(self): + return self._audio_source.block_size / self._audio_source.sr + + @property + def hop_dur(self): + if hasattr(self._audio_source, "hop_dur"): + return self._audio_source.hop_size / self._audio_source.sr + return self.block_dur + + @property + def hop_size(self): + if hasattr(self._audio_source, "hop_size"): + return self._audio_source.hop_size + return self.block_size + + @property + def max_read(self): + try: + return self._audio_source.max_read + except AttributeError: + return None + + def read(self): + return self._audio_source.read() + + def __getattr__(self, name): + if name in ("data", "rewind") and not self.rewindable: + raise AttributeError( + "'AudioReader' has no attribute '{}'".format(name) + ) + try: + return getattr(self._audio_source, name) + except AttributeError: + raise 
AttributeError( + "'AudioReader' has no attribute '{}'".format(name) + ) + + +# Keep AudioDataSource for compatibility +# Remove in a future version when ADSFactory is removed +AudioDataSource = AudioReader + + +class Recorder(AudioReader): + """Class to read fixed-size chunks of audio data from a source and keeps + data in a cache. Using this class is equivalent to initializing + :class:`AudioReader` with `record=True`. For more information about the + other parameters see :class:`AudioReader`. + + Once the desired amount of data is read, you can call the :func:`rewind` + method then get the recorded data via the :attr:`data` attribute. You can also + re-read cached data one window a time by calling :func:`read`. """ - - - if _WITH_NUMPY: - - _formats = {1: numpy.int8 , 2: numpy.int16, 4: numpy.int32} - - @staticmethod - def _convert(signal, sample_width): - return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]), dtype=numpy.float64) - - @staticmethod - def _signal_energy(signal): - return float(numpy.dot(signal, signal)) / len(signal) - - @staticmethod - def _signal_log_energy(signal): - energy = AudioEnergyValidator._signal_energy(signal) - if energy <= 0: - return -200 - return 10. * numpy.log10(energy) - - else: - - - _formats = {1: 'b' , 2: 'h', 4: 'i'} - - @staticmethod - def _convert(signal, sample_width): - return array("d", array(AudioEnergyValidator._formats[sample_width], signal)) - - @staticmethod - def _signal_energy(signal): - energy = 0. - for a in signal: - energy += a * a - return energy / len(signal) - - @staticmethod - def _signal_log_energy(signal): - energy = AudioEnergyValidator._signal_energy(signal) - if energy <= 0: - return -200 - return 10. * math.log10(energy) - - - def __init__(self, sample_width, energy_threshold=45): - self.sample_width = sample_width - self._energy_threshold = energy_threshold - - - def is_valid(self, data): - """ - Check if data is valid. 
Audio data will be converted into an array (of - signed values) of which the log energy is computed. Log energy is computed - as follows: - - .. code:: python - - arr = AudioEnergyValidator._convert(signal, sample_width) - energy = float(numpy.dot(arr, arr)) / len(arr) - log_energy = 10. * numpy.log10(energy) - - - :Parameters: - - `data` : either a *string* or a *Bytes* buffer - `data` is converted into a numerical array using the `sample_width` - given in the constructor. - - :Retruns: - - True if `log_energy` >= `energy_threshold`, False otherwise. - """ - - signal = AudioEnergyValidator._convert(data, self.sample_width) - return AudioEnergyValidator._signal_log_energy(signal) >= self._energy_threshold - - def get_energy_threshold(self): - return self._energy_threshold - - def set_energy_threshold(self, threshold): - self._energy_threshold = threshold + def __init__( + self, input, block_dur=0.01, hop_dur=None, max_read=None, **kwargs + ): + super().__init__( + input, + block_dur=block_dur, + hop_dur=hop_dur, + record=True, + max_read=max_read, + **kwargs + ) diff --git a/libs/auditok/workers.py b/libs/auditok/workers.py new file mode 100755 index 000000000..bb6d54a98 --- /dev/null +++ b/libs/auditok/workers.py @@ -0,0 +1,427 @@ +import os +import sys +from tempfile import NamedTemporaryFile +from abc import ABCMeta, abstractmethod +from threading import Thread +from datetime import datetime, timedelta +from collections import namedtuple +import wave +import subprocess +from queue import Queue, Empty +from .io import _guess_audio_format +from .util import AudioDataSource, make_duration_formatter +from .core import split +from .exceptions import ( + EndOfProcessing, + AudioEncodingError, + AudioEncodingWarning, +) + + +_STOP_PROCESSING = "STOP_PROCESSING" +_Detection = namedtuple("_Detection", "id start end duration") + + +def _run_subprocess(command): + try: + with subprocess.Popen( + command, + stdin=open(os.devnull, "rb"), + stdout=subprocess.PIPE, + 
stderr=subprocess.PIPE, + ) as proc: + stdout, stderr = proc.communicate() + return proc.returncode, stdout, stderr + except Exception: + err_msg = "Couldn't export audio using command: '{}'".format(command) + raise AudioEncodingError(err_msg) + + +class Worker(Thread, metaclass=ABCMeta): + def __init__(self, timeout=0.5, logger=None): + self._timeout = timeout + self._logger = logger + self._inbox = Queue() + Thread.__init__(self) + + def run(self): + while True: + message = self._get_message() + if message == _STOP_PROCESSING: + break + if message is not None: + self._process_message(message) + self._post_process() + + @abstractmethod + def _process_message(self, message): + """Process incoming messages""" + + def _post_process(self): + pass + + def _log(self, message): + self._logger.info(message) + + def _stop_requested(self): + try: + message = self._inbox.get_nowait() + if message == _STOP_PROCESSING: + return True + except Empty: + return False + + def stop(self): + self.send(_STOP_PROCESSING) + self.join() + + def send(self, message): + self._inbox.put(message) + + def _get_message(self): + try: + message = self._inbox.get(timeout=self._timeout) + return message + except Empty: + return None + + +class TokenizerWorker(Worker, AudioDataSource): + def __init__(self, reader, observers=None, logger=None, **kwargs): + self._observers = observers if observers is not None else [] + self._reader = reader + self._audio_region_gen = split(self, **kwargs) + self._detections = [] + self._log_format = "[DET]: Detection {0.id} (start: {0.start:.3f}, " + self._log_format += "end: {0.end:.3f}, duration: {0.duration:.3f})" + Worker.__init__(self, timeout=0.2, logger=logger) + + def _process_message(self): + pass + + @property + def detections(self): + return self._detections + + def _notify_observers(self, message): + for observer in self._observers: + observer.send(message) + + def run(self): + self._reader.open() + start_processing_timestamp = datetime.now() + for _id, 
audio_region in enumerate(self._audio_region_gen, start=1): + timestamp = start_processing_timestamp + timedelta( + seconds=audio_region.meta.start + ) + audio_region.meta.timestamp = timestamp + detection = _Detection( + _id, + audio_region.meta.start, + audio_region.meta.end, + audio_region.duration, + ) + self._detections.append(detection) + if self._logger is not None: + message = self._log_format.format(detection) + self._log(message) + self._notify_observers((_id, audio_region)) + self._notify_observers(_STOP_PROCESSING) + self._reader.close() + + def start_all(self): + for observer in self._observers: + observer.start() + self.start() + + def stop_all(self): + self.stop() + for observer in self._observers: + observer.stop() + self._reader.close() + + def read(self): + if self._stop_requested(): + return None + else: + return self._reader.read() + + def __getattr__(self, name): + return getattr(self._reader, name) + + +class StreamSaverWorker(Worker): + def __init__( + self, + audio_reader, + filename, + export_format=None, + cache_size_sec=0.5, + timeout=0.2, + ): + self._reader = audio_reader + sample_size_bytes = self._reader.sw * self._reader.ch + self._cache_size = cache_size_sec * self._reader.sr * sample_size_bytes + self._output_filename = filename + self._export_format = _guess_audio_format(export_format, filename) + if self._export_format is None: + self._export_format = "wav" + self._init_output_stream() + self._exported = False + self._cache = [] + self._total_cached = 0 + Worker.__init__(self, timeout=timeout) + + def _get_non_existent_filename(self): + filename = self._output_filename + ".wav" + i = 0 + while os.path.exists(filename): + i += 1 + filename = self._output_filename + "({}).wav".format(i) + return filename + + def _init_output_stream(self): + if self._export_format != "wav": + self._tmp_output_filename = self._get_non_existent_filename() + else: + self._tmp_output_filename = self._output_filename + self._wfp = 
wave.open(self._tmp_output_filename, "wb") + self._wfp.setframerate(self._reader.sr) + self._wfp.setsampwidth(self._reader.sw) + self._wfp.setnchannels(self._reader.ch) + + @property + def sr(self): + return self._reader.sampling_rate + + @property + def sw(self): + return self._reader.sample_width + + @property + def ch(self): + return self._reader.channels + + def __del__(self): + self._post_process() + + if ( + (self._tmp_output_filename != self._output_filename) + and self._exported + and os.path.exists(self._tmp_output_filename) + ): + os.remove(self._tmp_output_filename) + + def _process_message(self, data): + self._cache.append(data) + self._total_cached += len(data) + if self._total_cached >= self._cache_size: + self._write_cached_data() + + def _post_process(self): + while True: + try: + data = self._inbox.get_nowait() + if data != _STOP_PROCESSING: + self._cache.append(data) + self._total_cached += len(data) + except Empty: + break + self._write_cached_data() + self._wfp.close() + + def _write_cached_data(self): + if self._cache: + data = b"".join(self._cache) + self._wfp.writeframes(data) + self._cache = [] + self._total_cached = 0 + + def open(self): + self._reader.open() + + def close(self): + self._reader.close() + self.stop() + + def rewind(self): + # ensure compatibility with AudioDataSource with record=True + pass + + @property + def data(self): + with wave.open(self._tmp_output_filename, "rb") as wfp: + return wfp.readframes(-1) + + def save_stream(self): + if self._exported: + return self._output_filename + + if self._export_format in ("raw", "wav"): + if self._export_format == "raw": + self._export_raw() + self._exported = True + return self._output_filename + try: + self._export_with_ffmpeg_or_avconv() + except AudioEncodingError: + try: + self._export_with_sox() + except AudioEncodingError: + warn_msg = "Couldn't save audio data in the desired format " + warn_msg += "'{}'. 
Either none of 'ffmpeg', 'avconv' or 'sox' " + warn_msg += "is installed or this format is not recognized.\n" + warn_msg += "Audio file was saved as '{}'" + raise AudioEncodingWarning( + warn_msg.format( + self._export_format, self._tmp_output_filename + ) + ) + finally: + self._exported = True + return self._output_filename + + def _export_raw(self): + with open(self._output_filename, "wb") as wfp: + wfp.write(self.data) + + def _export_with_ffmpeg_or_avconv(self): + command = [ + "-y", + "-f", + "wav", + "-i", + self._tmp_output_filename, + "-f", + self._export_format, + self._output_filename, + ] + returncode, stdout, stderr = _run_subprocess(["ffmpeg"] + command) + if returncode != 0: + returncode, stdout, stderr = _run_subprocess(["avconv"] + command) + if returncode != 0: + raise AudioEncodingError(stderr) + return stdout, stderr + + def _export_with_sox(self): + command = [ + "sox", + "-t", + "wav", + self._tmp_output_filename, + self._output_filename, + ] + returncode, stdout, stderr = _run_subprocess(command) + if returncode != 0: + raise AudioEncodingError(stderr) + return stdout, stderr + + def close_output(self): + self._wfp.close() + + def read(self): + data = self._reader.read() + if data is not None: + self.send(data) + else: + self.send(_STOP_PROCESSING) + return data + + def __getattr__(self, name): + if name == "data": + return self.data + return getattr(self._reader, name) + + +class PlayerWorker(Worker): + def __init__(self, player, progress_bar=False, timeout=0.2, logger=None): + self._player = player + self._progress_bar = progress_bar + self._log_format = "[PLAY]: Detection {id} played" + Worker.__init__(self, timeout=timeout, logger=logger) + + def _process_message(self, message): + _id, audio_region = message + if self._logger is not None: + message = self._log_format.format(id=_id) + self._log(message) + audio_region.play( + player=self._player, progress_bar=self._progress_bar, leave=False + ) + + +class RegionSaverWorker(Worker): + def 
__init__( + self, + filename_format, + audio_format=None, + timeout=0.2, + logger=None, + **audio_parameters + ): + self._filename_format = filename_format + self._audio_format = audio_format + self._audio_parameters = audio_parameters + self._debug_format = "[SAVE]: Detection {id} saved as '{filename}'" + Worker.__init__(self, timeout=timeout, logger=logger) + + def _process_message(self, message): + _id, audio_region = message + filename = self._filename_format.format( + id=_id, + start=audio_region.meta.start, + end=audio_region.meta.end, + duration=audio_region.duration, + ) + filename = audio_region.save( + filename, self._audio_format, **self._audio_parameters + ) + if self._logger: + message = self._debug_format.format(id=_id, filename=filename) + self._log(message) + + +class CommandLineWorker(Worker): + def __init__(self, command, timeout=0.2, logger=None): + self._command = command + Worker.__init__(self, timeout=timeout, logger=logger) + self._debug_format = "[COMMAND]: Detection {id} command: '{command}'" + + def _process_message(self, message): + _id, audio_region = message + with NamedTemporaryFile(delete=False) as file: + filename = audio_region.save(file.name, audio_format="wav") + command = self._command.format(file=filename) + os.system(command) + if self._logger is not None: + message = self._debug_format.format(id=_id, command=command) + self._log(message) + + +class PrintWorker(Worker): + def __init__( + self, + print_format="{start} {end}", + time_format="%S", + timestamp_format="%Y/%m/%d %H:%M:%S.%f", + timeout=0.2, + ): + + self._print_format = print_format + self._format_time = make_duration_formatter(time_format) + self._timestamp_format = timestamp_format + self.detections = [] + Worker.__init__(self, timeout=timeout) + + def _process_message(self, message): + _id, audio_region = message + timestamp = audio_region.meta.timestamp + timestamp = timestamp.strftime(self._timestamp_format) + text = self._print_format.format( + id=_id, + 
start=self._format_time(audio_region.meta.start), + end=self._format_time(audio_region.meta.end), + duration=self._format_time(audio_region.duration), + timestamp=timestamp, + ) + print(text) |