Diffstat (limited to 'libs/auditok')
-rw-r--r--  libs/auditok/__init__.py        10
-rwxr-xr-x  libs/auditok/cmdline.py       1155
-rwxr-xr-x  libs/auditok/cmdline_util.py   126
-rw-r--r--  libs/auditok/core.py          1656
-rw-r--r--  libs/auditok/dataset.py         24
-rw-r--r--  libs/auditok/exceptions.py      42
-rw-r--r--  libs/auditok/io.py            1264
-rwxr-xr-x  libs/auditok/plotting.py       150
-rw-r--r--  libs/auditok/signal.py         179
-rw-r--r--  libs/auditok/signal_numpy.py    30
-rw-r--r--  libs/auditok/util.py          1734
-rwxr-xr-x  libs/auditok/workers.py        427
12 files changed, 4572 insertions(+), 2225 deletions(-)
diff --git a/libs/auditok/__init__.py b/libs/auditok/__init__.py
index 4ea697b77..edd336cc3 100644
--- a/libs/auditok/__init__.py
+++ b/libs/auditok/__init__.py
@@ -2,20 +2,16 @@
:author:
Amine SEHILI <[email protected]>
-2015-2016
+2015-2021
:License:
-This package is published under GNU GPL Version 3.
+This package is published under the MIT license.
"""
-from __future__ import absolute_import
from .core import *
from .io import *
from .util import *
-from . import dataset
from .exceptions import *
-__version__ = "0.1.5"
-
-
+__version__ = "0.2.0"
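The bumped version re-exports the new public API (load, split, AudioRegion) through `from .core import *`. A minimal usage sketch of that API, assuming an existing file "audio.wav" (the file name and parameter values are illustrative):

    import auditok

    # load a whole file into an AudioRegion
    region = auditok.load("audio.wav")
    print(region.duration)  # duration in seconds

    # split the same file into audio events (split() returns a
    # generator of AudioRegion objects)
    for event in auditok.split("audio.wav", min_dur=0.2, max_dur=5,
                               max_silence=0.3):
        print(event.duration)
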
diff --git a/libs/auditok/cmdline.py b/libs/auditok/cmdline.py
index b6a51d11b..7e7450762 100755
--- a/libs/auditok/cmdline.py
+++ b/libs/auditok/cmdline.py
@@ -1,789 +1,428 @@
#!/usr/bin/env python
# encoding: utf-8
-'''
-auditok.auditok -- Audio Activity Detection tool
-
-auditok.auditok is a program that can be used for Audio/Acoustic activity detection.
-It can read audio data from audio files as well as from built-in device(s) or standard input
+"""
+`auditok` -- An Audio Activity Detection tool
+`auditok` is a program that can be used for Audio/Acoustic
+activity detection. It can read audio data from audio files as well
+as from the microphone or standard input.
@author: Mohamed El Amine SEHILI
-
-@copyright: 2015 Mohamed El Amine SEHILI
-
-@license: GPL v3
-
+@copyright: 2015-2021 Mohamed El Amine SEHILI
+@license: MIT
-@deffield updated: 02 Dec 2015
-'''
+@deffield updated: 01 Mar 2021
+"""
import sys
import os
-
-from optparse import OptionParser, OptionGroup
-from threading import Thread
-import tempfile
-import wave
+from argparse import ArgumentParser
import time
import threading
-import logging
-try:
- import future
- from queue import Queue, Empty
-except ImportError:
- if sys.version_info >= (3, 0):
- from queue import Queue, Empty
- else:
- from Queue import Queue, Empty
+from auditok import __version__, AudioRegion
+from .util import AudioDataSource
+from .exceptions import EndOfProcessing, AudioEncodingWarning
+from .io import player_for
+from .cmdline_util import make_logger, make_kwargs, initialize_workers
+from . import workers
-try:
- from pydub import AudioSegment
- WITH_PYDUB = True
-except ImportError:
- WITH_PYDUB = False
-
-
-from .core import StreamTokenizer
-from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for
-from .util import ADSFactory, AudioEnergyValidator
-from auditok import __version__ as version
__all__ = []
-__version__ = version
-__date__ = '2015-11-23'
-__updated__ = '2015-03-11'
-
-DEBUG = 0
-TESTRUN = 1
-PROFILE = 0
-
-LOGGER_NAME = "AUDITOK_LOGGER"
-
-class AudioFileFormatError(Exception):
- pass
-
-class TimeFormatError(Exception):
- pass
-
-def file_to_audio_source(filename, filetype=None, **kwargs):
-
- lower_fname = filename.lower()
- rawdata = False
-
- if filetype is not None:
- filetype = filetype.lower()
-
- if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")):
-
- srate = kwargs.pop("sampling_rate", None)
- if srate is None:
- srate = kwargs.pop("sr", None)
-
- swidth = kwargs.pop("sample_width", None)
- if swidth is None:
- swidth = kwargs.pop("sw", None)
-
- ch = kwargs.pop("channels", None)
- if ch is None:
- ch = kwargs.pop("ch", None)
-
- if None in (swidth, srate, ch):
- raise Exception("All audio parameters are required for raw data")
-
- data = open(filename).read()
- rawdata = True
-
- # try first with pydub
- if WITH_PYDUB:
-
- use_channel = kwargs.pop("use_channel", None)
- if use_channel is None:
- use_channel = kwargs.pop("uc", None)
-
- if use_channel is None:
- use_channel = 1
- else:
- try:
- use_channel = int(use_channel)
- except ValueError:
- pass
-
- if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] :
- raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'")
-
- asegment = None
-
- if rawdata:
- asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch)
- if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")):
- asegment = AudioSegment.from_wav(filename)
- elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")):
- asegment = AudioSegment.from_mp3(filename)
- elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")):
- asegment = AudioSegment.from_ogg(filename)
- elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")):
- asegment = AudioSegment.from_flv(filename)
- else:
- asegment = AudioSegment.from_file(filename)
-
- if asegment.channels > 1:
-
- if isinstance(use_channel, int):
- if use_channel > asegment.channels:
- raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels))
- else:
- asegment = asegment.split_to_mono()[use_channel - 1]
- else:
- ch_lower = use_channel.lower()
-
- if ch_lower == "mix":
- asegment = asegment.set_channels(1)
-
- elif use_channel.lower() == "left":
- asegment = asegment.split_to_mono()[0]
-
- elif use_channel.lower() == "right":
- asegment = asegment.split_to_mono()[1]
-
- return BufferAudioSource(data_buffer = asegment._data,
- sampling_rate = asegment.frame_rate,
- sample_width = asegment.sample_width,
- channels = asegment.channels)
- # fall back to standard python
- else:
- if rawdata:
- if ch != 1:
- raise ValueError("Cannot handle multi-channel audio without pydub")
- return BufferAudioSource(data, srate, swidth, ch)
-
- if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")):
-
- wfp = wave.open(filename)
-
- ch = wfp.getnchannels()
- if ch != 1:
- wfp.close()
- raise ValueError("Cannot handle multi-channel audio without pydub")
-
- srate = wfp.getframerate()
- swidth = wfp.getsampwidth()
- data = wfp.readframes(wfp.getnframes())
- wfp.close()
- return BufferAudioSource(data, srate, swidth, ch)
-
- raise AudioFileFormatError("Cannot read audio file format")
-
-
-def save_audio_data(data, filename, filetype=None, **kwargs):
-
- lower_fname = filename.lower()
- if filetype is not None:
- filetype = filetype.lower()
-
- # save raw data
- if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")):
- fp = open(filename, "w")
- fp.write(data)
- fp.close()
- return
-
- # save other types of data
- # requires all audio parameters
- srate = kwargs.pop("sampling_rate", None)
- if srate is None:
- srate = kwargs.pop("sr", None)
-
- swidth = kwargs.pop("sample_width", None)
- if swidth is None:
- swidth = kwargs.pop("sw", None)
-
- ch = kwargs.pop("channels", None)
- if ch is None:
- ch = kwargs.pop("ch", None)
-
- if None in (swidth, srate, ch):
- raise Exception("All audio parameters are required to save no raw data")
-
- if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")):
- # use standard python's wave module
- fp = wave.open(filename, "w")
- fp.setnchannels(ch)
- fp.setsampwidth(swidth)
- fp.setframerate(srate)
- fp.writeframes(data)
- fp.close()
-
- elif WITH_PYDUB:
-
- asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch)
- asegment.export(filename, format=filetype)
-
- else:
- raise AudioFileFormatError("cannot write file format {0} (file name: {1})".format(filetype, filename))
-
-
-def plot_all(signal, sampling_rate, energy_as_amp, detections=[], show=True, save_as=None):
-
- import matplotlib.pyplot as plt
- import numpy as np
- t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate )
- if len(t) > len(signal):
- t = t[: len(signal) - len(t)]
-
- for start, end in detections:
- p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4)
-
- line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude")
- plt.plot(t, signal)
- legend = plt.legend(["Detection threshold"], bbox_to_anchor=(0., 1.02, 1., .102), loc=1, fontsize=16)
- ax = plt.gca().add_artist(legend)
-
- plt.xlabel("Time (s)", fontsize=24)
- plt.ylabel("Amplitude (normalized)", fontsize=24)
-
- if save_as is not None:
- plt.savefig(save_as, dpi=120)
-
- if show:
- plt.show()
-
-
-def seconds_to_str_fromatter(_format):
- """
- Accepted format directives: %i %s %m %h
- """
- # check directives are correct
-
- if _format == "%S":
- def _fromatter(seconds):
- return "{:.2f}".format(seconds)
-
- elif _format == "%I":
- def _fromatter(seconds):
- return "{0}".format(int(seconds * 1000))
-
- else:
- _format = _format.replace("%h", "{hrs:02d}")
- _format = _format.replace("%m", "{mins:02d}")
- _format = _format.replace("%s", "{secs:02d}")
- _format = _format.replace("%i", "{millis:03d}")
-
- try:
- i = _format.index("%")
- raise TimeFormatError("Unknow time format directive '{0}'".format(_format[i:i+2]))
- except ValueError:
- pass
-
- def _fromatter(seconds):
- millis = int(seconds * 1000)
- hrs, millis = divmod(millis, 3600000)
- mins, millis = divmod(millis, 60000)
- secs, millis = divmod(millis, 1000)
- return _format.format(hrs=hrs, mins=mins, secs=secs, millis=millis)
-
- return _fromatter
-
-
-
-class Worker(Thread):
-
- def __init__(self, timeout=0.2, debug=False, logger=None):
- self.timeout = timeout
- self.debug = debug
- self.logger = logger
-
- if self.debug and self.logger is None:
- self.logger = logging.getLogger(LOGGER_NAME)
- self.logger.setLevel(logging.DEBUG)
- handler = logging.StreamHandler(sys.stdout)
- self.logger.addHandler(handler)
-
- self._inbox = Queue()
- self._stop_request = Queue()
- Thread.__init__(self)
-
-
- def debug_message(self, message):
- self.logger.debug(message)
-
- def _stop_requested(self):
-
- try:
- message = self._stop_request.get_nowait()
- if message == "stop":
- return True
-
- except Empty:
- return False
-
- def stop(self):
- self._stop_request.put("stop")
- self.join()
-
- def send(self, message):
- self._inbox.put(message)
-
- def _get_message(self):
- try:
- message = self._inbox.get(timeout=self.timeout)
- return message
- except Empty:
- return None
-
-
-class TokenizerWorker(Worker):
-
- END_OF_PROCESSING = "END_OF_PROCESSING"
-
- def __init__(self, ads, tokenizer, analysis_window, observers):
- self.ads = ads
- self.tokenizer = tokenizer
- self.analysis_window = analysis_window
- self.observers = observers
- self._inbox = Queue()
- self.count = 0
- Worker.__init__(self)
-
- def run(self):
-
- def notify_observers(data, start, end):
- audio_data = b''.join(data)
- self.count += 1
-
- start_time = start * self.analysis_window
- end_time = (end+1) * self.analysis_window
- duration = (end - start + 1) * self.analysis_window
-
- # notify observers
- for observer in self.observers:
- observer.notify({"id" : self.count,
- "audio_data" : audio_data,
- "start" : start,
- "end" : end,
- "start_time" : start_time,
- "end_time" : end_time,
- "duration" : duration}
- )
-
- self.ads.open()
- self.tokenizer.tokenize(data_source=self, callback=notify_observers)
- for observer in self.observers:
- observer.notify(TokenizerWorker.END_OF_PROCESSING)
-
- def add_observer(self, observer):
- self.observers.append(observer)
-
- def remove_observer(self, observer):
- self.observers.remove(observer)
-
- def read(self):
- if self._stop_requested():
- return None
- else:
- return self.ads.read()
-
-
-class PlayerWorker(Worker):
-
- def __init__(self, player, timeout=0.2, debug=False, logger=None):
- self.player = player
- Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
-
- def run(self):
- while True:
- if self._stop_requested():
- break
-
- message = self._get_message()
- if message is not None:
- if message == TokenizerWorker.END_OF_PROCESSING:
- break
-
- audio_data = message.pop("audio_data", None)
- start_time = message.pop("start_time", None)
- end_time = message.pop("end_time", None)
- dur = message.pop("duration", None)
- _id = message.pop("id", None)
-
- if audio_data is not None:
- if self.debug:
- self.debug_message("[PLAY]: Detection {id} played (start:{start}, end:{end}, dur:{dur})".format(id=_id,
- start="{:5.2f}".format(start_time), end="{:5.2f}".format(end_time), dur="{:5.2f}".format(dur)))
- self.player.play(audio_data)
-
- def notify(self, message):
- self.send(message)
-
-
-class CommandLineWorker(Worker):
-
- def __init__(self, command, timeout=0.2, debug=False, logger=None):
- self.command = command
- Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
-
- def run(self):
- while True:
- if self._stop_requested():
- break
-
- message = self._get_message()
- if message is not None:
- if message == TokenizerWorker.END_OF_PROCESSING:
- break
-
- audio_data = message.pop("audio_data", None)
- _id = message.pop("id", None)
- if audio_data is not None:
- raw_audio_file = tempfile.NamedTemporaryFile(delete=False)
- raw_audio_file.write(audio_data)
- cmd = self.command.replace("$", raw_audio_file.name)
- if self.debug:
- self.debug_message("[CMD ]: Detection {id} command: {cmd}".format(id=_id, cmd=cmd))
- os.system(cmd)
- os.unlink(raw_audio_file.name)
-
- def notify(self, message):
- self.send(message)
-
-
-class TokenSaverWorker(Worker):
-
- def __init__(self, name_format, filetype, timeout=0.2, debug=False, logger=None, **kwargs):
- self.name_format = name_format
- self.filetype = filetype
- self.kwargs = kwargs
- Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
-
- def run(self):
- while True:
- if self._stop_requested():
- break
-
- message = self._get_message()
- if message is not None:
- if message == TokenizerWorker.END_OF_PROCESSING:
- break
-
- audio_data = message.pop("audio_data", None)
- start_time = message.pop("start_time", None)
- end_time = message.pop("end_time", None)
- _id = message.pop("id", None)
- if audio_data is not None and len(audio_data) > 0:
- fname = self.name_format.format(N=_id, start = "{:.2f}".format(start_time), end = "{:.2f}".format(end_time))
- try:
- if self.debug:
- self.debug_message("[SAVE]: Detection {id} saved as {fname}".format(id=_id, fname=fname))
- save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs)
- except Exception as e:
- sys.stderr.write(str(e) + "\n")
-
- def notify(self, message):
- self.send(message)
-
-
-class LogWorker(Worker):
-
- def __init__(self, print_detections=False, output_format="{start} {end}",
- time_formatter=seconds_to_str_fromatter("%S"), timeout=0.2, debug=False, logger=None):
-
- self.print_detections = print_detections
- self.output_format = output_format
- self.time_formatter = time_formatter
- self.detections = []
- Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
-
- def run(self):
- while True:
- if self._stop_requested():
- break
-
- message = self._get_message()
-
- if message is not None:
-
- if message == TokenizerWorker.END_OF_PROCESSING:
- break
-
- audio_data = message.pop("audio_data", None)
- _id = message.pop("id", None)
- start = message.pop("start", None)
- end = message.pop("end", None)
- start_time = message.pop("start_time", None)
- end_time = message.pop("end_time", None)
- if audio_data is not None and len(audio_data) > 0:
-
- if self.debug:
- self.debug_message("[DET ]: Detection {id} (start:{start}, end:{end})".format(id=_id,
- start="{:5.2f}".format(start_time),
- end="{:5.2f}".format(end_time)))
-
- if self.print_detections:
- print(self.output_format.format(id = _id,
- start = self.time_formatter(start_time),
- end = self.time_formatter(end_time)))
-
- self.detections.append((_id, start, end, start_time, end_time))
-
-
- def notify(self, message):
- self.send(message)
-
+__date__ = "2015-11-23"
+__updated__ = "2021-03-01"
def main(argv=None):
- '''Command line options.'''
-
program_name = os.path.basename(sys.argv[0])
- program_version = version
- program_build_date = "%s" % __updated__
-
- program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
- #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse
- program_longdesc = '''''' # optional - give further explanation about what the program does
- program_license = "Copyright 2015 Mohamed El Amine SEHILI \
- Licensed under the General Public License (GPL) Version 3 \nhttp://www.gnu.org/licenses/"
-
if argv is None:
argv = sys.argv[1:]
try:
- # setup option parser
- parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
-
- group = OptionGroup(parser, "[Input-Output options]")
- group.add_option("-i", "--input", dest="input", help="Input audio or video file. Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE")
- group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String")
- group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT")
- group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE")
- group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING")
- group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING")
- group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 1st or left channel)]", type=str, default="1", metavar="STRING")
- parser.add_option_group(group)
-
-
- group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.")
- group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT")
- group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT")
- group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT")
- group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT")
- group.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: keep trailing silence]", action="store_true", default=False)
- group.add_option("-e", "--energy-threshold", dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=50, metavar="FLOAT")
- parser.add_option_group(group)
-
-
- group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.")
- group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT")
- group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT")
- group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, metavar="INT")
- parser.add_option_group(group)
-
- group = OptionGroup(parser, "[Do something with detections]", "Use these options to print, play or plot detections.")
- group.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command (e.g. -C 'du -h $')", default=None, type=str, metavar="STRING")
- group.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False)
- group.add_option("-p", "--plot", dest="plot", help="Plot and show audio signal and detections (requires matplotlib)", action="store_true", default=False)
- group.add_option("", "--save-image", dest="save_image", help="Save plotted audio signal and detections as a picture or a PDF file (requires matplotlib)", type=str, default=None, metavar="FILE")
- group.add_option("", "--printf", dest="printf", help="print detections one per line using a user supplied format (e.g. '[{id}]: {start} -- {end}'). Available keywords {id}, {start} and {end}", type=str, default="{id} {start} {end}", metavar="STRING")
- group.add_option("", "--time-format", dest="time_format", help="format used to print {start} and {end}. [Default= %default]. %S: absolute time in sec. %I: absolute time in ms. If at least one of (%h, %m, %s, %i) is used, convert time into hours, minutes, seconds and millis (e.g. %h:%m:%s.%i). Only required fields are printed", type=str, default="%S", metavar="STRING")
- parser.add_option_group(group)
-
- parser.add_option("-q", "--quiet", dest="quiet", help="Do not print any information about detections [default: print 'id', 'start' and 'end' of each detection]", action="store_true", default=False)
- parser.add_option("-D", "--debug", dest="debug", help="Print processing operations to STDOUT", action="store_true", default=False)
- parser.add_option("", "--debug-file", dest="debug_file", help="Print processing operations to FILE", type=str, default=None, metavar="FILE")
-
-
+ parser = ArgumentParser(
+ prog=program_name, description="An Audio Tokenization tool"
+ )
+ parser.add_argument(
+ "--version", "-v", action="version", version=__version__
+ )
+ group = parser.add_argument_group("Input-Output options")
+ group.add_argument(
+ dest="input",
+ help="Input audio or video file. Use '-' for stdin "
+ "[default: read from microphone using pyaudio]",
+ metavar="input",
+ nargs="?",
+ default=None,
+ )
+ group.add_argument(
+ "-I",
+ "--input-device-index",
+ dest="input_device_index",
+ help="Audio device index [default: %(default)s]. "
+ "Optional and only effective when using PyAudio",
+ type=int,
+ default=None,
+ metavar="INT",
+ )
+ group.add_argument(
+ "-F",
+ "--audio-frame-per-buffer",
+ dest="frame_per_buffer",
+ help="Audio frame per buffer [default: %(default)s]. "
+ "Optional and only effective when using PyAudio",
+ type=int,
+ default=1024,
+ metavar="INT",
+ )
+ group.add_argument(
+ "-f",
+ "--input-format",
+ dest="input_format",
+ type=str,
+ default=None,
+ help="Input audio file format. If not given, guess format from "
+ "extension. If output file name has no extension, guess format "
+ "from file header (requires pydub). If none of the previous is "
+ "true, raise an error",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "-M",
+ "--max-read",
+ dest="max_read",
+ type=float,
+ default=None,
+ help="Maximum data (in seconds) to read from microphone or file "
+ "[default: read until the end of file/stream]",
+ metavar="FLOAT",
+ )
+ group.add_argument(
+ "-L",
+ "--large-file",
+ dest="large_file",
+ action="store_true",
+ default=False,
+ help="Whether input file should be treated as a large file. "
+ "If True, data will be read from file on demand, otherwise all "
+ "audio data is loaded to memory before tokenization.",
+ )
+ group.add_argument(
+ "-O",
+ "--save-stream",
+ dest="save_stream",
+ type=str,
+ default=None,
+ help="Save acquired audio data (from file or microphone) to disk."
+ " If omitted no data will be saved. [default: omitted]",
+ metavar="FILE",
+ )
+ group.add_argument(
+ "-o",
+ "--save-detections-as",
+ dest="save_detections_as",
+ type=str,
+ default=None,
+ help="File name format for detections."
+ "The following placeholders can be used to build output file name "
+ "for each detection: {id} (sequential, starts from 1), {start}, "
+ "{end} and {duration}. Time placeholders are in seconds. "
+ "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "-T",
+ "--output-format",
+ dest="output_format",
+ type=str,
+ default=None,
+ help="Audio format used to save detections and/or main stream. "
+ "If not supplied, then it will: (1. be guessed from extension or "
+ "(2. use raw format",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "-u",
+ "--use-channel",
+ dest="use_channel",
+ type=str,
+ default=None,
+ help="Which channel to use for tokenization when input stream is "
+ "multi-channel (0 is the first channel). Default is None, meaning "
+ "that all channels will be considered for tokenization (i.e., get "
+ "any valid audio event regardless of the channel it occurs in). "
+ "This value can also be 'mix' (alias 'avg' or 'average') and "
+ "means mix down all audio channels into one channel (i.e. compute "
+ "average channel) and use the resulting channel for tokenization. "
+ "Whatever option is used, saved audio events will contain the same"
+ " number of channels as input stream. "
+ "[Default: None, use all channels]",
+ metavar="INT/STRING",
+ )
+
+ group = parser.add_argument_group(
+ "Tokenization options", "Set tokenizer options."
+ )
+ group.add_argument(
+ "-a",
+ "--analysis-window",
+ dest="analysis_window",
+ default=0.01,
+ type=float,
+ help="Size of analysis window in seconds [default: %(default)s "
+ "(10ms)]",
+ metavar="FLOAT",
+ )
+ group.add_argument(
+ "-n",
+ "--min-duration",
+ dest="min_duration",
+ type=float,
+ default=0.2,
+ help="Min duration of a valid audio event in seconds "
+ "[default: %(default)s]",
+ metavar="FLOAT",
+ )
+ group.add_argument(
+ "-m",
+ "--max-duration",
+ dest="max_duration",
+ type=float,
+ default=5,
+ help="Max duration of a valid audio event in seconds "
+ "[default: %(default)s]",
+ metavar="FLOAT",
+ )
+ group.add_argument(
+ "-s",
+ "--max-silence",
+ dest="max_silence",
+ type=float,
+ default=0.3,
+ help="Max duration of a consecutive silence within a valid audio "
+ "event in seconds [default: %(default)s]",
+ metavar="FLOAT",
+ )
+ group.add_argument(
+ "-d",
+ "--drop-trailing-silence",
+ dest="drop_trailing_silence",
+ action="store_true",
+ default=False,
+ help="Drop trailing silence from a detection [default: keep "
+ "trailing silence]",
+ )
+ group.add_argument(
+ "-R",
+ "--strict-min-duration",
+ dest="strict_min_duration",
+ action="store_true",
+ default=False,
+ help="Reject an event shorter than --min-duration even if it's "
+ "adjacent to the latest valid event that reached max-duration "
+ "[default: keep such events]",
+ )
+ group.add_argument(
+ "-e",
+ "--energy-threshold",
+ dest="energy_threshold",
+ type=float,
+ default=50,
+ help="Log energy threshold for detection [default: %(default)s]",
+ metavar="FLOAT",
+ )
+
+ group = parser.add_argument_group(
+ "Audio parameters",
+ "Define audio parameters if data is read from a "
+ "headerless file (raw or stdin) or you want to use "
+ "different microphone parameters.",
+ )
+ group.add_argument(
+ "-r",
+ "--rate",
+ dest="sampling_rate",
+ type=int,
+ default=16000,
+ help="Sampling rate of audio data [default: %(default)s]",
+ metavar="INT",
+ )
+ group.add_argument(
+ "-c",
+ "--channels",
+ dest="channels",
+ type=int,
+ default=1,
+ help="Number of channels of audio data [default: %(default)s]",
+ metavar="INT",
+ )
+ group.add_argument(
+ "-w",
+ "--width",
+ dest="sample_width",
+ type=int,
+ default=2,
+ help="Number of bytes per audio sample [default: %(default)s]",
+ metavar="INT",
+ )
+
+ group = parser.add_argument_group(
+ "Do something with audio events",
+ "Use these options to print, play back or plot detections.",
+ )
+ group.add_argument(
+ "-C",
+ "--command",
+ dest="command",
+ type=str,
+ help="Command to call when an audio detection occurs. Use '{file}' "
+ "as a placeholder for the temporary wav file that will contain "
+ "event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
+ " or \"-C 'play -q {file}'\" to play audio with sox)",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "-E",
+ "--echo",
+ dest="echo",
+ action="store_true",
+ default=False,
+ help="Play back each detection immediately using pyaudio",
+ )
+ group.add_argument(
+ "-B",
+ "--progress-bar",
+ dest="progress_bar",
+ action="store_true",
+ default=False,
+ help="Show a progress bar when playing audio",
+ )
+ group.add_argument(
+ "-p",
+ "--plot",
+ dest="plot",
+ action="store_true",
+ default=False,
+ help="Plot and show audio signal and detections (requires "
+ "matplotlib)",
+ )
+ group.add_argument(
+ "--save-image",
+ dest="save_image",
+ type=str,
+ help="Save plotted audio signal and detections as a picture or a "
+ "PDF file (requires matplotlib)",
+ metavar="FILE",
+ )
+ group.add_argument(
+ "--printf",
+ dest="printf",
+ type=str,
+ default="{id} {start} {end}",
+ help="Print audio events information, one per line, using this "
+ "format. Format can contain text with the following placeholders: "
+ "{id} (sequential, starts from 1), {start}, {end}, {duration} and "
+ "{timestamp}. The first 3 time placeholders are in seconds and "
+ "their format can be set using --time-format argument. "
+ "{timestamp} is the system timestamp (date and time) of the event "
+ "and can be set using --timestamp-format argument.\n"
+ "Example: '[{id}]: {start} -> {end} -- {timestamp}'",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "--time-format",
+ dest="time_format",
+ type=str,
+ default="%S",
+ help="Format used to print {start}, {end} and {duration} "
+ "placeholders used with --printf [default= %(default)s]. The "
+ "following formats are accepted:\n"
+ "%%S: absolute time in seconds. %%I: absolute time in ms. If at "
+ "least one of (%%h, %%m, %%s, %%i) is used, convert time into "
+ "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
+ "supplied fields are printed. Note that %%S and %%I can only be "
+ "used alone",
+ metavar="STRING",
+ )
+ group.add_argument(
+ "--timestamp-format",
+ dest="timestamp_format",
+ type=str,
+ default="%Y/%m/%d %H:%M:%S",
+ help="Format used to print {timestamp}. Should be a format "
+ "accepted by 'datetime' standard module. Default: "
+ "'%%Y/%%m/%%d %%H:%%M:%%S'",
+ )
+ parser.add_argument(
+ "-q",
+ "--quiet",
+ dest="quiet",
+ action="store_true",
+ default=False,
+ help="Do not print any information about detections [default: "
+ "print 'id', 'start' and 'end' of each detection]",
+ )
+ parser.add_argument(
+ "-D",
+ "--debug",
+ dest="debug",
+ action="store_true",
+ default=False,
+ help="Print processing operations to STDOUT",
+ )
+ parser.add_argument(
+ "--debug-file",
+ dest="debug_file",
+ type=str,
+ default=None,
+ help="Print processing operations to FILE",
+ metavar="FILE",
+ )
+
+ args = parser.parse_args(argv)
+ logger = make_logger(args.debug, args.debug_file)
+ kwargs = make_kwargs(args)
+ reader, observers = initialize_workers(
+ logger=logger, **kwargs.io, **kwargs.miscellaneous
+ )
+ tokenizer_worker = workers.TokenizerWorker(
+ reader, observers, logger=logger, **kwargs.split
+ )
+ tokenizer_worker.start_all()
- # process options
- (opts, args) = parser.parse_args(argv)
-
- if opts.input == "-":
- asource = StdinAudioSource(sampling_rate = opts.sampling_rate,
- sample_width = opts.sample_width,
- channels = opts.channels)
- #read data from a file
- elif opts.input is not None:
- asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel)
-
- # read data from microphone via pyaudio
- else:
- try:
- asource = PyAudioSource(sampling_rate = opts.sampling_rate,
- sample_width = opts.sample_width,
- channels = opts.channels)
- except Exception:
- sys.stderr.write("Cannot read data from audio device!\n")
- sys.stderr.write("You should either install pyaudio or read data from STDIN\n")
- sys.exit(2)
-
- logger = logging.getLogger(LOGGER_NAME)
- logger.setLevel(logging.DEBUG)
-
- handler = logging.StreamHandler(sys.stdout)
- if opts.quiet or not opts.debug:
- # only critical messages will be printed
- handler.setLevel(logging.CRITICAL)
- else:
- handler.setLevel(logging.DEBUG)
-
- logger.addHandler(handler)
-
- if opts.debug_file is not None:
- logger.setLevel(logging.DEBUG)
- opts.debug = True
- handler = logging.FileHandler(opts.debug_file, "w")
- fmt = logging.Formatter('[%(asctime)s] | %(message)s')
- handler.setFormatter(fmt)
- handler.setLevel(logging.DEBUG)
- logger.addHandler(handler)
-
- record = opts.output_main is not None or opts.plot or opts.save_image is not None
-
- ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record)
- validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold)
-
-
- if opts.drop_trailing_silence:
- mode = StreamTokenizer.DROP_TRAILING_SILENCE
- else:
- mode = 0
-
- analysis_window_per_second = 1. / opts.analysis_window
- tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * analysis_window_per_second,
- max_length=int(opts.max_duration * analysis_window_per_second),
- max_continuous_silence=opts.max_silence * analysis_window_per_second,
- mode = mode)
-
-
- observers = []
- tokenizer_worker = None
-
- if opts.output_tokens is not None:
-
- try:
- # check user format is correct
- fname = opts.output_tokens.format(N=0, start=0, end=0)
-
- # find file type for detections
- tok_type = opts.output_type
- if tok_type is None:
- tok_type = os.path.splitext(opts.output_tokens)[1][1:]
- if tok_type == "":
- tok_type = "wav"
-
- token_saver = TokenSaverWorker(name_format=opts.output_tokens, filetype=tok_type,
- debug=opts.debug, logger=logger, sr=asource.get_sampling_rate(),
- sw=asource.get_sample_width(),
- ch=asource.get_channels())
- observers.append(token_saver)
-
- except Exception:
- sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens))
- sys.exit(2)
-
- if opts.echo:
- try:
- player = player_for(asource)
- player_worker = PlayerWorker(player=player, debug=opts.debug, logger=logger)
- observers.append(player_worker)
- except Exception:
- sys.stderr.write("Cannot get an audio player!\n")
- sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n")
- sys.exit(2)
-
- if opts.command is not None and len(opts.command) > 0:
- cmd_worker = CommandLineWorker(command=opts.command, debug=opts.debug, logger=logger)
- observers.append(cmd_worker)
-
- if not opts.quiet or opts.plot is not None or opts.save_image is not None:
- oformat = opts.printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r")
- converter = seconds_to_str_fromatter(opts.time_format)
- log_worker = LogWorker(print_detections = not opts.quiet, output_format=oformat,
- time_formatter=converter, logger=logger, debug=opts.debug)
- observers.append(log_worker)
-
- tokenizer_worker = TokenizerWorker(ads, tokenizer, opts.analysis_window, observers)
-
- def _save_main_stream():
- # find file type
- main_type = opts.output_type
- if main_type is None:
- main_type = os.path.splitext(opts.output_main)[1][1:]
- if main_type == "":
- main_type = "wav"
- ads.close()
- ads.rewind()
- data = ads.get_audio_source().get_data_buffer()
- if len(data) > 0:
- save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(),
- sw = asource.get_sample_width(),
- ch = asource.get_channels())
-
- def _plot():
- import numpy as np
- ads.close()
- ads.rewind()
- data = ads.get_audio_source().get_data_buffer()
- signal = AudioEnergyValidator._convert(data, asource.get_sample_width())
- detections = [(det[3] , det[4]) for det in log_worker.detections]
- max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1
- energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude
- plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections, show = opts.plot, save_as = opts.save_image)
-
-
- # start observer threads
- for obs in observers:
- obs.start()
- # start tokenization thread
- tokenizer_worker.start()
-
while True:
time.sleep(1)
if len(threading.enumerate()) == 1:
- break
-
- tokenizer_worker = None
-
- if opts.output_main is not None:
- _save_main_stream()
- if opts.plot or opts.save_image is not None:
- _plot()
-
- return 0
-
- except KeyboardInterrupt:
-
+ raise EndOfProcessing
+
+ except (KeyboardInterrupt, EndOfProcessing):
if tokenizer_worker is not None:
- tokenizer_worker.stop()
- for obs in observers:
- obs.stop()
-
- if opts.output_main is not None:
- _save_main_stream()
- if opts.plot or opts.save_image is not None:
- _plot()
-
+ tokenizer_worker.stop_all()
+
+ if isinstance(reader, workers.StreamSaverWorker):
+ reader.join()
+ try:
+ reader.save_stream()
+ except AudioEncodingWarning as ae_warn:
+ print(str(ae_warn), file=sys.stderr)
+
+ if args.plot or args.save_image is not None:
+ from .plotting import plot
+
+ reader.rewind()
+ record = AudioRegion(
+ reader.data, reader.sr, reader.sw, reader.ch
+ )
+ detections = (
+ (det.start, det.end) for det in tokenizer_worker.detections
+ )
+ plot(
+ record,
+ detections=detections,
+ energy_threshold=args.energy_threshold,
+ show=True,
+ save_as=args.save_image,
+ )
return 0
- except Exception as e:
- sys.stderr.write(program_name + ": " + str(e) + "\n")
- sys.stderr.write("for help use -h\n")
-
- return 2
if __name__ == "__main__":
- if DEBUG:
- sys.argv.append("-h")
- if TESTRUN:
- import doctest
- doctest.testmod()
- if PROFILE:
- import cProfile
- import pstats
- profile_filename = 'auditok.auditok_profile.txt'
- cProfile.run('main()', profile_filename)
- statsfile = open("profile_stats.txt", "wb")
- p = pstats.Stats(profile_filename, stream=statsfile)
- stats = p.strip_dirs().sort_stats('cumulative')
- stats.print_stats()
- statsfile.close()
- sys.exit(0)
- sys.exit(main())
+ sys.exit(main(None))
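Since main() takes an argv list and returns an exit code, the rewritten CLI can also be exercised programmatically. A hedged sketch based on the options defined above (the file name and format strings are illustrative, and the usual `auditok` console entry point is assumed):

    from auditok.cmdline import main

    # equivalent to the shell command:
    #   auditok audio.wav -o "event_{id}_{start}-{end}.wav" --printf "{id} {start} {end}"
    exit_code = main([
        "audio.wav",
        "-o", "event_{id}_{start}-{end}.wav",
        "--printf", "{id} {start} {end}",
    ])
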
diff --git a/libs/auditok/cmdline_util.py b/libs/auditok/cmdline_util.py
new file mode 100755
index 000000000..bde72aa36
--- /dev/null
+++ b/libs/auditok/cmdline_util.py
@@ -0,0 +1,126 @@
+import sys
+import logging
+from collections import namedtuple
+from . import workers
+from .util import AudioDataSource
+from .io import player_for
+
+_AUDITOK_LOGGER = "AUDITOK_LOGGER"
+KeywordArguments = namedtuple(
+ "KeywordArguments", ["io", "split", "miscellaneous"]
+)
+
+
+def make_kwargs(args_ns):
+ if args_ns.save_stream is None:
+ record = args_ns.plot or (args_ns.save_image is not None)
+ else:
+ record = False
+ try:
+ use_channel = int(args_ns.use_channel)
+ except (ValueError, TypeError):
+ use_channel = args_ns.use_channel
+
+ io_kwargs = {
+ "input": args_ns.input,
+ "audio_format": args_ns.input_format,
+ "max_read": args_ns.max_read,
+ "block_dur": args_ns.analysis_window,
+ "sampling_rate": args_ns.sampling_rate,
+ "sample_width": args_ns.sample_width,
+ "channels": args_ns.channels,
+ "use_channel": use_channel,
+ "save_stream": args_ns.save_stream,
+ "save_detections_as": args_ns.save_detections_as,
+ "export_format": args_ns.output_format,
+ "large_file": args_ns.large_file,
+ "frames_per_buffer": args_ns.frame_per_buffer,
+ "input_device_index": args_ns.input_device_index,
+ "record": record,
+ }
+
+ split_kwargs = {
+ "min_dur": args_ns.min_duration,
+ "max_dur": args_ns.max_duration,
+ "max_silence": args_ns.max_silence,
+ "drop_trailing_silence": args_ns.drop_trailing_silence,
+ "strict_min_dur": args_ns.strict_min_duration,
+ "energy_threshold": args_ns.energy_threshold,
+ }
+
+ miscellaneous = {
+ "echo": args_ns.echo,
+ "progress_bar": args_ns.progress_bar,
+ "command": args_ns.command,
+ "quiet": args_ns.quiet,
+ "printf": args_ns.printf,
+ "time_format": args_ns.time_format,
+ "timestamp_format": args_ns.timestamp_format,
+ }
+ return KeywordArguments(io_kwargs, split_kwargs, miscellaneous)
+
+
+def make_logger(stderr=False, file=None, name=_AUDITOK_LOGGER):
+ if not stderr and file is None:
+ return None
+ logger = logging.getLogger(name)
+ logger.setLevel(logging.INFO)
+ if stderr:
+ handler = logging.StreamHandler(sys.stderr)
+ handler.setLevel(logging.INFO)
+ logger.addHandler(handler)
+
+ if file is not None:
+ handler = logging.FileHandler(file, "w")
+ fmt = logging.Formatter("[%(asctime)s] | %(message)s")
+ handler.setFormatter(fmt)
+ handler.setLevel(logging.INFO)
+ logger.addHandler(handler)
+ return logger
+
+
+def initialize_workers(logger=None, **kwargs):
+ observers = []
+ reader = AudioDataSource(source=kwargs["input"], **kwargs)
+ if kwargs["save_stream"] is not None:
+ reader = workers.StreamSaverWorker(
+ reader,
+ filename=kwargs["save_stream"],
+ export_format=kwargs["export_format"],
+ )
+ reader.start()
+
+ if kwargs["save_detections_as"] is not None:
+ worker = workers.RegionSaverWorker(
+ kwargs["save_detections_as"],
+ kwargs["export_format"],
+ logger=logger,
+ )
+ observers.append(worker)
+
+ if kwargs["echo"]:
+ player = player_for(reader)
+ worker = workers.PlayerWorker(
+ player, progress_bar=kwargs["progress_bar"], logger=logger
+ )
+ observers.append(worker)
+
+ if kwargs["command"] is not None:
+ worker = workers.CommandLineWorker(
+ command=kwargs["command"], logger=logger
+ )
+ observers.append(worker)
+
+ if not kwargs["quiet"]:
+ print_format = (
+ kwargs["printf"]
+ .replace("\\n", "\n")
+ .replace("\\t", "\t")
+ .replace("\\r", "\r")
+ )
+ worker = workers.PrintWorker(
+ print_format, kwargs["time_format"], kwargs["timestamp_format"]
+ )
+ observers.append(worker)
+
+ return reader, observers
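To illustrate the helpers above: make_logger() returns None when neither stderr nor a file is requested, and otherwise an INFO-level logger that initialize_workers() then passes to the observer workers. A small sketch of the logger part (the log file path is illustrative):

    from auditok.cmdline_util import make_logger

    # log processing messages to stderr and to a file
    logger = make_logger(stderr=True, file="auditok.log")
    logger.info("workers initialized")
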
diff --git a/libs/auditok/core.py b/libs/auditok/core.py
index 47441d2b7..af00dc7af 100644
--- a/libs/auditok/core.py
+++ b/libs/auditok/core.py
@@ -1,264 +1,1267 @@
"""
-This module gathers processing (i.e. tokenization) classes.
-
-Class summary
-=============
-
.. autosummary::
+ :toctree: generated/
- StreamTokenizer
+ load
+ split
+ AudioRegion
+ StreamTokenizer
"""
+import os
+import math
+from .util import AudioReader, DataValidator, AudioEnergyValidator
+from .io import check_audio_data, to_file, player_for, get_audio_source
+from .exceptions import TooSamllBlockDuration
+
+try:
+ from . import signal_numpy as signal
+except ImportError:
+ from . import signal
+
+__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
+
+
+DEFAULT_ANALYSIS_WINDOW = 0.05
+DEFAULT_ENERGY_THRESHOLD = 50
+_EPSILON = 1e-10
+
+
+def load(input, skip=0, max_read=None, **kwargs):
+ """Load audio data from a source and return it as an :class:`AudioRegion`.
+
+ Parameters
+ ----------
+ input : None, str, bytes, AudioSource
+ source to read audio data from. If `str`, it should be a path to a
+ valid audio file. If `bytes`, it is used as raw audio data. If it is
+ "-", raw data will be read from stdin. If None, read audio data from
+ the microphone using PyAudio. If of type `bytes` or is a path to a
+ raw audio file then `sampling_rate`, `sample_width` and `channels`
+ parameters (or their aliases) are required. If it's an
+ :class:`AudioSource` object it's used directly to read data.
+ skip : float, default: 0
+ amount, in seconds, of audio data to skip from source. If read from
+ a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
+ max_read : float, default: None
+ amount, in seconds, of audio data to read from source. If read from
+ microphone, `max_read` should not be None, otherwise a `ValueError` is
+ raised.
+ audio_format, fmt : str
+ type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
+ be used if `input` is a string path to an audio file. If not given,
+ audio type will be guessed from file name extension or from file
+ header.
+ sampling_rate, sr : int
+ sampling rate of audio data. Required if `input` is a raw audio file,
+ a `bytes` object or None (i.e., read from microphone).
+ sample_width, sw : int
+ number of bytes used to encode one audio sample, typically 1, 2 or 4.
+ Required for raw data, see `sampling_rate`.
+ channels, ch : int
+ number of channels of audio data. Required for raw data, see
+ `sampling_rate`.
+ large_file : bool, default: False
+ If True, AND if `input` is a path to a *wav* of a *raw* audio file
+ (and **only** these two formats) then audio file is not fully loaded to
+ memory in order to create the region (but the portion of data needed to
+ create the region is of course loaded to memory). Set to True if
+ `max_read` is significantly smaller then the size of a large audio file
+ that shouldn't be entirely loaded to memory.
+
+ Returns
+ -------
+ region: AudioRegion
+
+ Raises
+ ------
+ ValueError
+ raised if `input` is None (i.e., read data from microphone) and `skip`
+ != 0, or if `input` is None and `max_read` is None (meaning that when
+ reading from the microphone, no data should be skipped, and the maximum
+ amount of data to read should be explicitly provided).
+ """
+ return AudioRegion.load(input, skip, max_read, **kwargs)
+
+
+def split(
+ input,
+ min_dur=0.2,
+ max_dur=5,
+ max_silence=0.3,
+ drop_trailing_silence=False,
+ strict_min_dur=False,
+ **kwargs
+):
+ """
+ Split audio data and return a generator of AudioRegions
+
+ Parameters
+ ----------
+ input : str, bytes, AudioSource, AudioReader, AudioRegion or None
+ input audio data. If str, it should be a path to an existing audio file.
+ "-" is interpreted as standard input. If bytes, input is considered as
+ raw audio data. If None, read audio from microphone.
+ Every object that is not an `AudioReader` will be transformed into an
+ `AudioReader` before processing. If it is a `str` that refers to a raw
+ audio file, `bytes` or None, audio parameters should be provided using
+ kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or their
+ aliases).
+ If `input` is str then audio format will be guessed from file extension.
+ `audio_format` (alias `fmt`) kwarg can also be given to specify audio
+ format explicitly. If none of these options is available, rely on
+ backend (currently only pydub is supported) to load data.
+ min_dur : float, default: 0.2
+ minimum duration in seconds of a detected audio event. Using large
+ values for `min_dur` means very short audio events (e.g., very short
+ 1-word utterances like 'yes' or 'no') can be missed. Using very short
+ values might result in a high number of short, barely useful audio
+ events.
+ max_dur : float, default: 5
+ maximum duration in seconds of a detected audio event. If an audio event
+ lasts more than `max_dur` it will be truncated. If the continuation of a
+ truncated audio event is shorter than `min_dur` then this continuation
+ is accepted as a valid audio event if `strict_min_dur` is False.
+ Otherwise it is rejected.
+ max_silence : float, default: 0.3
+ maximum duration of continuous silence within an audio event. There
+ might be many silent gaps of this duration within one audio event. If
+ the continuous silence happens at the end of the event, then it is kept
+ as part of the event if `drop_trailing_silence` is False (default).
+ drop_trailing_silence : bool, default: False
+ Whether to remove trailing silence from detected events. To avoid abrupt
+ cuts in speech, trailing silence should be kept, therefore this
+ parameter should be False.
+ strict_min_dur : bool, default: False
+ strict minimum duration. Do not accept an audio event if it is shorter
+ than `min_dur` even if it is contiguous to the latest valid event. This
+ happens if the latest detected event had reached `max_dur`.
+
+ Other Parameters
+ ----------------
+ analysis_window, aw : float, default: 0.05 (50 ms)
+ duration of analysis window in seconds. A value between 0.01 (10 ms) and
+ 0.1 (100 ms) should be good for most use-cases.
+ audio_format, fmt : str
+ type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
+ used if `input` is a string path to an audio file. If not given, audio
+ type will be guessed from file name extension or from file header.
+ sampling_rate, sr : int
+ sampling rate of audio data. Required if `input` is a raw audio file,
+ a bytes object or None (i.e., read from microphone).
+ sample_width, sw : int
+ number of bytes used to encode one audio sample, typically 1, 2 or 4.
+ Required for raw data, see `sampling_rate`.
+ channels, ch : int
+ number of channels of audio data. Required for raw data, see
+ `sampling_rate`.
+ use_channel, uc : {None, "mix"} or int
+ which channel to use for split if `input` has multiple audio channels.
+ Regardless of which channel is used for splitting, returned audio events
+ contain data from *all* channels, just as `input`.
+ The following values are accepted:
+
+ - None (alias "any"): accept audio activity from any channel, even if
+ other channels are silent. This is the default behavior.
+
+ - "mix" ("avg" or "average"): mix down all channels (i.e. compute
+ average channel) and split the resulting channel.
+
+ - int (0 <= value < `channels`): use the channel specified by this
+ integer id for split.
+
+ large_file : bool, default: False
+ If True, AND if `input` is a path to a *wav* or a *raw* audio file
+ (and only these two formats), then audio data is lazily loaded to memory
+ (i.e., one analysis window at a time). Otherwise the whole file is loaded
+ to memory before split. Set to True if the size of the file is larger
+ than available memory.
+ max_read, mr : float, default: None, read until end of stream
+ maximum data to read from source in seconds.
+ validator, val : callable, DataValidator
+ custom data validator. If `None` (default), an `AudioEnergyValidator` is
+ used with the given energy threshold. Can be a callable or an instance
+ of `DataValidator` that implements `is_valid`. In either case, it'll be
+ called with a window of audio data as the first parameter.
+ energy_threshold, eth : float, default: 50
+ energy threshold for audio activity detection. Audio regions that have
+ enough windows with a signal energy equal to or above this threshold
+ are considered valid audio events. Here we refer to this quantity as
+ the energy of the signal but, to be more accurate, it is the log
+ energy, computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see
+ :class:`AudioEnergyValidator` and
+ :func:`calculate_energy_single_channel`). If `validator` is given, this
+ argument is ignored.
+
+ Yields
+ ------
+ AudioRegion
+ a generator of detected :class:`AudioRegion` s.
+ """
+ if min_dur <= 0:
+ raise ValueError("'min_dur' ({}) must be > 0".format(min_dur))
+ if max_dur <= 0:
+ raise ValueError("'max_dur' ({}) must be > 0".format(max_dur))
+ if max_silence < 0:
+ raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence))
+
+ if isinstance(input, AudioReader):
+ source = input
+ analysis_window = source.block_dur
+ else:
+ analysis_window = kwargs.get(
+ "analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
+ )
+ if analysis_window <= 0:
+ raise ValueError(
+ "'analysis_window' ({}) must be > 0".format(analysis_window)
+ )
+
+ params = kwargs.copy()
+ params["max_read"] = params.get("max_read", params.get("mr"))
+ params["audio_format"] = params.get("audio_format", params.get("fmt"))
+ if isinstance(input, AudioRegion):
+ params["sampling_rate"] = input.sr
+ params["sample_width"] = input.sw
+ params["channels"] = input.ch
+ input = bytes(input)
+ try:
+ source = AudioReader(input, block_dur=analysis_window, **params)
+ except TooSamllBlockDuration as exc:
+ err_msg = "Too small 'analysis_windows' ({0}) for sampling rate "
+ err_msg += "({1}). Analysis windows should at least be 1/{1} to "
+ err_msg += "cover one single data sample"
+ raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate))
+
+ validator = kwargs.get("validator", kwargs.get("val"))
+ if validator is None:
+ energy_threshold = kwargs.get(
+ "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
+ )
+ use_channel = kwargs.get("use_channel", kwargs.get("uc"))
+ validator = AudioEnergyValidator(
+ energy_threshold, source.sw, source.ch, use_channel=use_channel
+ )
+ mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
+ if strict_min_dur:
+ mode |= StreamTokenizer.STRICT_MIN_LENGTH
+ min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
+ max_length = _duration_to_nb_windows(
+ max_dur, analysis_window, math.floor, _EPSILON
+ )
+ max_continuous_silence = _duration_to_nb_windows(
+ max_silence, analysis_window, math.floor, _EPSILON
+ )
+
+ err_msg = "({0} sec.) results in {1} analysis window(s) "
+ err_msg += "({1} == {6}({0} / {2})) which is {5} the number "
+ err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
+ if min_length > max_length:
+ err_msg = "'min_dur' " + err_msg
+ raise ValueError(
+ err_msg.format(
+ min_dur,
+ min_length,
+ analysis_window,
+ max_length,
+ max_dur,
+ "higher than",
+ "ceil",
+ )
+ )
+
+ if max_continuous_silence >= max_length:
+ err_msg = "'max_silence' " + err_msg
+ raise ValueError(
+ err_msg.format(
+ max_silence,
+ max_continuous_silence,
+ analysis_window,
+ max_length,
+ max_dur,
+ "higher or equal to",
+ "floor",
+ )
+ )
+
+ tokenizer = StreamTokenizer(
+ validator, min_length, max_length, max_continuous_silence, mode=mode
+ )
+ source.open()
+ token_gen = tokenizer.tokenize(source, generator=True)
+ region_gen = (
+ _make_audio_region(
+ token[0],
+ token[1],
+ source.block_dur,
+ source.sr,
+ source.sw,
+ source.ch,
+ )
+ for token in token_gen
+ )
+ return region_gen
+
+
+def _duration_to_nb_windows(
+ duration, analysis_window, round_fn=round, epsilon=0
+):
+ """
+ Converts a given duration into a positive integer of analysis windows.
+ If `duration / analysis_window` is not an integer, the result is
+ rounded by `round_fn` (to the nearest integer by default). If
+ `duration == 0`, returns `0`. If `duration < analysis_window`, returns 1.
+ `duration` and `analysis_window` can be in seconds or milliseconds but
+ must be in the same unit.
+
+ Parameters
+ ----------
+ duration : float
+ a given duration in seconds or ms.
+ analysis_window: float
+ size of analysis window, in the same unit as `duration`.
+ round_fn : callable
+ function called to round the result. Default: `round`.
+ epsilon : float
+ small value to add to the division result before rounding.
+ E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
+ `round_fn=math.floor` returns `2` instead of `3`. Adding a small value
+ to `0.3 / 0.1` avoids this error.
+
+ Returns
+ -------
+ nb_windows : int
+ minimum number of `analysis_window`'s to cover `duration`. That means
+ that `analysis_window * nb_windows >= duration`.
+ """
+ if duration < 0 or analysis_window <= 0:
+ err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
+ raise ValueError(err_msg.format(duration, analysis_window))
+ if duration == 0:
+ return 0
+ return int(round_fn(duration / analysis_window + epsilon))
+
+
+def _make_audio_region(
+ data_frames,
+ start_frame,
+ frame_duration,
+ sampling_rate,
+ sample_width,
+ channels,
+):
+ """
+ Helper function to create an `AudioRegion` from parameters returned by
+ tokenization object. It takes care of setting up region `start` and `end`
+ in metadata.
+
+ Parameters
+ ----------
+ frame_duration: float
+ duration of analysis window in seconds
+ start_frame : int
+ index of the first analysis window
+ sampling_rate : int
+ sampling rate of audio data
+ sample_width : int
+ number of bytes of one audio sample
+ channels : int
+ number of channels of audio data
+
+ Returns
+ -------
+ audio_region : AudioRegion
+ AudioRegion whose start time is calculated as:
+ `start_frame * frame_duration`
+ """
+ start = start_frame * frame_duration
+ data = b"".join(data_frames)
+ duration = len(data) / (sampling_rate * sample_width * channels)
+ meta = {"start": start, "end": start + duration}
+ return AudioRegion(data, sampling_rate, sample_width, channels, meta)
+
+
+def _read_chunks_online(max_read, **kwargs):
+ """
+ Helper function to read audio data from an online blocking source
+    (i.e., microphone). Used to build an `AudioRegion` and can intercept
+    KeyboardInterrupt so that reading stops as soon as this exception is
+    raised. This makes building `AudioRegion`s in [i]python sessions and
+    Jupyter notebooks more user friendly.
+
+ Parameters
+ ----------
+ max_read : float
+ maximum amount of data to read in seconds.
+ kwargs :
+ audio parameters (sampling_rate, sample_width and channels).
+
+ See also
+ --------
+    `AudioRegion.load`
+ """
+ reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
+ reader.open()
+ data = []
+ try:
+ while True:
+ frame = reader.read()
+ if frame is None:
+ break
+ data.append(frame)
+ except KeyboardInterrupt:
+ # Stop data acquisition from microphone when pressing
+ # Ctrl+C on a [i]python session or a notebook
+ pass
+ reader.close()
+ return (
+ b"".join(data),
+ reader.sampling_rate,
+ reader.sample_width,
+ reader.channels,
+ )
+
+
+def _read_offline(input, skip=0, max_read=None, **kwargs):
+ """
+    Helper function to read audio data from an offline source (i.e., a
+    file). Used to build `AudioRegion`s.
+
+ Parameters
+ ----------
+ input : str, bytes
+ path to audio file (if str), or a bytes object representing raw audio
+ data.
+ skip : float, default 0
+        amount of data to skip from the beginning of audio source.
+ max_read : float, default: None
+ maximum amount of audio data to read. Default: None, means read until
+ end of stream.
+ kwargs :
+ audio parameters (sampling_rate, sample_width and channels).
+
+ See also
+ --------
+    `AudioRegion.load`
+
+ """
+ audio_source = get_audio_source(input, **kwargs)
+ audio_source.open()
+ if skip is not None and skip > 0:
+ skip_samples = round(skip * audio_source.sampling_rate)
+ audio_source.read(skip_samples)
+ if max_read is not None:
+ if max_read < 0:
+ max_read = None
+ else:
+ max_read = round(max_read * audio_source.sampling_rate)
+ data = audio_source.read(max_read)
+ audio_source.close()
+ return (
+ data,
+ audio_source.sampling_rate,
+ audio_source.sample_width,
+ audio_source.channels,
+ )
+
+
+def _check_convert_index(index, types, err_msg):
+ if not isinstance(index, slice) or index.step is not None:
+ raise TypeError(err_msg)
+ start = index.start if index.start is not None else 0
+ stop = index.stop
+ for index in (start, stop):
+ if index is not None and not isinstance(index, types):
+ raise TypeError(err_msg)
+ return start, stop
+
+
+class _SecondsView:
+ """A class to create a view of `AudioRegion` that can be sliced using
+ indices in seconds.
+ """
+
+ def __init__(self, region):
+ self._region = region
+
+ def __getitem__(self, index):
+ err_msg = "Slicing AudioRegion by seconds requires indices of type "
+ err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
+ start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
+ sr = self._region.sampling_rate
+ start_sample = int(start_s * sr)
+ stop_sample = None if stop_s is None else round(stop_s * sr)
+ return self._region[start_sample:stop_sample]
+
+ @property
+ def len(self):
+ """
+ Return region duration in seconds.
+ """
+ return self._region.duration
+
+
+class _MillisView(_SecondsView):
+ """A class to create a view of `AudioRegion` that can be sliced using
+ indices in milliseconds.
+ """
+
+ def __getitem__(self, index):
+ err_msg = (
+ "Slicing AudioRegion by milliseconds requires indices of type "
+ )
+ err_msg += "'int' without a step (e.g. region.sec[500:1500])"
+ start_ms, stop_ms = _check_convert_index(index, (int), err_msg)
+ start_sec = start_ms / 1000
+ stop_sec = None if stop_ms is None else stop_ms / 1000
+ index = slice(start_sec, stop_sec)
+ return super(_MillisView, self).__getitem__(index)
+
+ def __len__(self):
+ """
+ Return region duration in milliseconds.
+ """
+ return round(self._region.duration * 1000)
+
+ @property
+ def len(self):
+ """
+ Return region duration in milliseconds.
+ """
+ return len(self)
+
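# Editor's note: an illustrative sketch (not part of the diff) of the
# seconds/milliseconds views on a 2-second, 8 kHz, 16-bit mono region
# (AudioRegion is defined further down in this diff).
from auditok import AudioRegion

region = AudioRegion(b"\x00" * 2 * 8000 * 2, 8000, 2, 1)
assert region.sec.len == 2.0                  # duration in seconds
assert region.ms.len == 2000                  # duration in milliseconds
assert len(region) == 16000                   # length in samples
assert region.sec[0:1] == region.ms[0:1000]   # same first second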
+
+class _AudioRegionMetadata(dict):
+ """A class to store `AudioRegion`'s metadata."""
+
+ def __getattr__(self, name):
+ if name in self:
+ return self[name]
+ else:
+ err_msg = "AudioRegion metadata has no entry '{}'"
+ raise AttributeError(err_msg.format(name))
+
+ def __setattr__(self, name, value):
+ self[name] = value
+
+ def __str__(self):
+ return "\n".join("{}: {}".format(k, v) for k, v in self.items())
+
+ def __repr__(self):
+ return str(self)
+
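# Editor's note: sketch (not part of the diff) of attribute-style access
# on _AudioRegionMetadata.
from auditok.core import _AudioRegionMetadata

meta = _AudioRegionMetadata({"start": 1.25, "end": 2.75})
assert meta.start == meta["start"] == 1.25
meta.source = "mic"              # attribute assignment writes to the dict
assert meta["source"] == "mic"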
+
+class AudioRegion(object):
+ """
+ AudioRegion encapsulates raw audio data and provides an interface to
+ perform simple operations on it. Use `AudioRegion.load` to build an
+ `AudioRegion` from different types of objects.
+
+ Parameters
+ ----------
+ data : bytes
+ raw audio data as a bytes object
+ sampling_rate : int
+ sampling rate of audio data
+ sample_width : int
+ number of bytes of one audio sample
+ channels : int
+ number of channels of audio data
+ meta : dict, default: None
+ any collection of <key:value> elements used to build metadata for
+ this `AudioRegion`. Meta data can be accessed via `region.meta.key`
+ if `key` is a valid python attribute name, or via `region.meta[key]`
+        if not. Note that the :func:`split` function (or the
+        :meth:`AudioRegion.split` method) returns `AudioRegion`s with
+        ``start`` and ``end`` meta values that indicate the location in
+        seconds of the region in the original audio data.
+
+ See also
+ --------
+ AudioRegion.load
+
+ """
+
+ def __init__(self, data, sampling_rate, sample_width, channels, meta=None):
+ check_audio_data(data, sample_width, channels)
+ self._data = data
+ self._sampling_rate = sampling_rate
+ self._sample_width = sample_width
+ self._channels = channels
+ self._samples = None
+ self.splitp = self.split_and_plot
+
+ if meta is not None:
+ self._meta = _AudioRegionMetadata(meta)
+ else:
+ self._meta = None
+
+ self._seconds_view = _SecondsView(self)
+ self.sec = self.seconds
+ self.s = self.seconds
+
+ self._millis_view = _MillisView(self)
+ self.ms = self.millis
+
+ @property
+ def meta(self):
+ return self._meta
+
+ @meta.setter
+ def meta(self, new_meta):
+ """Meta data of audio region."""
+ self._meta = _AudioRegionMetadata(new_meta)
+
+ @classmethod
+ def load(cls, input, skip=0, max_read=None, **kwargs):
+ """
+ Create an `AudioRegion` by loading data from `input`. See :func:`load`
+        for parameters description.
+
+ Returns
+ -------
+ region: AudioRegion
+
+ Raises
+ ------
+ ValueError
+            raised if `input` is None while `skip` != 0 or `max_read`
+            is None.
+ """
+ if input is None:
+ if skip > 0:
+ raise ValueError(
+ "'skip' should be 0 when reading from microphone"
+ )
+ if max_read is None or max_read < 0:
+ raise ValueError(
+ "'max_read' should not be None when reading from "
+ "microphone"
+ )
+ data, sampling_rate, sample_width, channels = _read_chunks_online(
+ max_read, **kwargs
+ )
+ else:
+ data, sampling_rate, sample_width, channels = _read_offline(
+ input, skip=skip, max_read=max_read, **kwargs
+ )
+
+ return cls(data, sampling_rate, sample_width, channels)
+
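# Editor's note: typical AudioRegion.load calls, sketched for
# illustration (not part of the diff); 'audio.wav' is a hypothetical
# file path.
from auditok import AudioRegion

region = AudioRegion.load("audio.wav", skip=1.0, max_read=5.0)
# Microphone input needs an explicit bounded duration (and PyAudio):
# region = AudioRegion.load(None, max_read=10)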
+ @property
+ def seconds(self):
+ """
+ A view to slice audio region by seconds (using ``region.seconds[start:end]``).
+ """
+ return self._seconds_view
+
+ @property
+ def millis(self):
+ """A view to slice audio region by milliseconds (using ``region.millis[start:end]``)."""
+ return self._millis_view
+
+ @property
+ def duration(self):
+ """
+ Returns region duration in seconds.
+ """
+ return len(self._data) / (
+ self.sampling_rate * self.sample_width * self.channels
+ )
+
+ @property
+ def sampling_rate(self):
+ """Samling rate of audio data."""
+ return self._sampling_rate
+
+ @property
+ def sr(self):
+ """Samling rate of audio data, alias for `sampling_rate`."""
+ return self._sampling_rate
+
+ @property
+ def sample_width(self):
+ """Number of bytes per sample, one channel considered."""
+ return self._sample_width
+
+ @property
+ def sw(self):
+ """Number of bytes per sample, alias for `sampling_rate`."""
+ return self._sample_width
+
+ @property
+ def channels(self):
+ """Number of channels of audio data."""
+ return self._channels
+
+ @property
+ def ch(self):
+ """Number of channels of audio data, alias for `channels`."""
+ return self._channels
+
+ def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
+ """
+ Play audio region.
+
+ Parameters
+ ----------
+ progress_bar : bool, default: False
+ whether to use a progress bar while playing audio. Default: False.
+            `progress_bar` requires `tqdm`; if it is not installed, no
+            progress bar will be shown.
+        player : AudioPlayer, default: None
+            audio player to use. If None (default), use `player_for()`
+            to get a new audio player.
+ progress_bar_kwargs : kwargs
+ keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
+ use `leave=False` to clean up the screen when play finishes).
+ """
+ if player is None:
+ player = player_for(self)
+ player.play(
+ self._data, progress_bar=progress_bar, **progress_bar_kwargs
+ )
+
+ def save(self, file, audio_format=None, exists_ok=True, **audio_parameters):
+ """
+ Save audio region to file.
-from auditok.util import DataValidator
+ Parameters
+ ----------
+ file : str
+ path to output audio file. May contain `{duration}` placeholder
+            as well as any placeholder that this region's metadata might
+            contain (e.g., regions returned by `split` have metadata with
+            `start` and `end` attributes that can be used to build the
+            output file name as `{meta.start}` and `{meta.end}`). See the
+            examples below for placeholders with format specifiers.
-__all__ = ["StreamTokenizer"]
+ audio_format : str, default: None
+ format used to save audio data. If None (default), format is guessed
+ from file name's extension. If file name has no extension, audio
+ data is saved as a raw (headerless) audio file.
+ exists_ok : bool, default: True
+ If True, overwrite `file` if a file with the same name exists.
+            If False, raise a `FileExistsError` if `file` exists.
+ audio_parameters: dict
+ any keyword arguments to be passed to audio saving backend.
+ Returns
+ -------
+ file: str
+            name of output file with placeholders replaced.
+        Raises
+        ------
+        FileExistsError
+            if `file` exists and `exists_ok` is False.
-class StreamTokenizer():
+
+ Examples
+ --------
+ >>> region = AudioRegion(b'\\0' * 2 * 24000,
+ >>> sampling_rate=16000,
+ >>> sample_width=2,
+ >>> channels=1)
+        >>> region.meta = {'start': 2.25}
+        >>> region.meta.end = 2.25 + region.duration
+        >>> region.save('audio_{meta.start}-{meta.end}.wav')
+        'audio_2.25-3.75.wav'
+        >>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav')
+        'region_2.250_1.500.wav'
+ """
+ if isinstance(file, str):
+ file = file.format(duration=self.duration, meta=self.meta)
+ if not exists_ok and os.path.exists(file):
+ raise FileExistsError("file '{file}' exists".format(file=file))
+ to_file(
+ self._data,
+ file,
+ audio_format,
+ sr=self.sr,
+ sw=self.sw,
+ ch=self.ch,
+ audio_parameters=audio_parameters,
+ )
+ return file
+
+ def split(
+ self,
+ min_dur=0.2,
+ max_dur=5,
+ max_silence=0.3,
+ drop_trailing_silence=False,
+ strict_min_dur=False,
+ **kwargs
+ ):
+ """Split audio region. See :func:`auditok.split()` for a comprehensive
+ description of split parameters.
+        See Also :meth:`AudioRegion.split_and_plot`.
+ """
+ if kwargs.get("max_read", kwargs.get("mr")) is not None:
+ warn_msg = "'max_read' (or 'mr') should not be used with "
+ warn_msg += "AudioRegion.split_and_plot(). You should rather "
+ warn_msg += "slice audio region before calling this method"
+ raise RuntimeWarning(warn_msg)
+ return split(
+ self,
+ min_dur=min_dur,
+ max_dur=max_dur,
+ max_silence=max_silence,
+ drop_trailing_silence=drop_trailing_silence,
+ strict_min_dur=strict_min_dur,
+ **kwargs
+ )
+
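# Editor's note: an illustrative use of AudioRegion.split (not part of
# the diff); 'speech.wav' and the parameter values are hypothetical.
from auditok import AudioRegion

for r in AudioRegion.load("speech.wav").split(min_dur=0.3, max_dur=8,
                                              max_silence=0.2):
    print(r.meta.start, r.meta.end, r.duration)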
+ def plot(
+ self,
+ scale_signal=True,
+ show=True,
+ figsize=None,
+ save_as=None,
+ dpi=120,
+ theme="auditok",
+ ):
+ """Plot audio region, one sub-plot for each channel.
+
+ Parameters
+ ----------
+ scale_signal : bool, default: True
+            if True, scale the signal by subtracting its mean and dividing
+            by its standard deviation before plotting.
+ show : bool
+ whether to show plotted signal right after the call.
+ figsize : tuple, default: None
+ width and height of the figure to pass to `matplotlib`.
+        save_as : str, default: None
+ if provided, also save plot to file.
+ dpi : int, default: 120
+ plot dpi to pass to `matplotlib`.
+ theme : str or dict, default: "auditok"
+            plot theme to use. Currently only the "auditok" theme is
+            implemented. To provide your own theme, see
+            :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
+ """
+ try:
+ from auditok.plotting import plot
+
+ plot(
+ self,
+ scale_signal=scale_signal,
+ show=show,
+ figsize=figsize,
+ save_as=save_as,
+ dpi=dpi,
+ theme=theme,
+ )
+ except ImportError:
+ raise RuntimeWarning("Plotting requires matplotlib")
+
+ def split_and_plot(
+ self,
+ min_dur=0.2,
+ max_dur=5,
+ max_silence=0.3,
+ drop_trailing_silence=False,
+ strict_min_dur=False,
+ scale_signal=True,
+ show=True,
+ figsize=None,
+ save_as=None,
+ dpi=120,
+ theme="auditok",
+ **kwargs
+ ):
+ """Split region and plot signal and detections. Alias: :meth:`splitp`.
+ See :func:`auditok.split()` for a comprehensive description of split
+ parameters. Also see :meth:`plot` for plot parameters.
+ """
+ try:
+ from auditok.plotting import plot
+
+ regions = self.split(
+ min_dur=min_dur,
+ max_dur=max_dur,
+ max_silence=max_silence,
+ drop_trailing_silence=drop_trailing_silence,
+ strict_min_dur=strict_min_dur,
+ **kwargs
+ )
+ regions = list(regions)
+ detections = ((reg.meta.start, reg.meta.end) for reg in regions)
+ eth = kwargs.get(
+ "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
+ )
+ plot(
+ self,
+ scale_signal=scale_signal,
+ detections=detections,
+ energy_threshold=eth,
+ show=show,
+ figsize=figsize,
+ save_as=save_as,
+ dpi=dpi,
+ theme=theme,
+ )
+ return regions
+ except ImportError:
+ raise RuntimeWarning("Plotting requires matplotlib")
+
+ def __array__(self):
+ return self.samples
+
+ @property
+ def samples(self):
+ """Audio region as arrays of samples, one array per channel."""
+ if self._samples is None:
+ self._samples = signal.to_array(
+ self._data, self.sample_width, self.channels
+ )
+ return self._samples
+
+ def __len__(self):
+ """
+ Return region length in number of samples.
+ """
+ return len(self._data) // (self.sample_width * self.channels)
+
+ @property
+ def len(self):
+ """
+ Return region length in number of samples.
+ """
+ return len(self)
+
+ def __bytes__(self):
+ return self._data
+
+ def __str__(self):
+ return (
+ "AudioRegion(duration={:.3f}, "
+ "sampling_rate={}, sample_width={}, channels={})".format(
+ self.duration, self.sr, self.sw, self.ch
+ )
+ )
+
+ def __repr__(self):
+ return str(self)
+
+ def __add__(self, other):
+ """
+        Concatenate this region and `other` and return a new region.
+ Both regions must have the same sampling rate, sample width
+ and number of channels. If not, raises a `ValueError`.
+ """
+ if not isinstance(other, AudioRegion):
+ raise TypeError(
+ "Can only concatenate AudioRegion, "
+ 'not "{}"'.format(type(other))
+ )
+ if other.sr != self.sr:
+ raise ValueError(
+ "Can only concatenate AudioRegions of the same "
+ "sampling rate ({} != {})".format(self.sr, other.sr)
+ )
+ if other.sw != self.sw:
+ raise ValueError(
+ "Can only concatenate AudioRegions of the same "
+ "sample width ({} != {})".format(self.sw, other.sw)
+ )
+ if other.ch != self.ch:
+ raise ValueError(
+ "Can only concatenate AudioRegions of the same "
+ "number of channels ({} != {})".format(self.ch, other.ch)
+ )
+ data = self._data + other._data
+ return AudioRegion(data, self.sr, self.sw, self.ch)
+
+ def __radd__(self, other):
+ """
+        Concatenate `other` and this region. `other` should be an
+        `AudioRegion` with the same audio parameters as this region,
+        but can exceptionally be `0` to make it possible to concatenate
+        many regions with `sum`.
+ """
+ if other == 0:
+ return self
+ return other.add(self)
+
+ def __mul__(self, n):
+ if not isinstance(n, int):
+ err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
+ raise TypeError(err_msg.format(type(n)))
+ data = self._data * n
+ return AudioRegion(data, self.sr, self.sw, self.ch)
+
+ def __rmul__(self, n):
+ return self * n
+
+ def __truediv__(self, n):
+ if not isinstance(n, int) or n <= 0:
+ raise TypeError("AudioRegion can only be divided by a positive int")
+ samples_per_sub_region, rest = divmod(len(self), n)
+ onset = 0
+ sub_regions = []
+ while onset < len(self):
+ offset = 0
+ if rest > 0:
+ offset = 1
+ rest -= 1
+ offset += onset + samples_per_sub_region
+ sub_regions.append(self[onset:offset])
+ onset = offset
+ return sub_regions
+
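# Editor's note: sketch (not part of the diff) of the arithmetic
# operators defined on AudioRegion.
from auditok import AudioRegion

r = AudioRegion(b"\x00" * 3200, 16000, 2, 1)   # 0.1 s of silence
assert (r + r).duration == 0.2                 # concatenation
assert sum([r, r, r]).duration == 0.3          # __radd__ makes sum() work
assert len((2 * r) / 4) == 4                   # four equal sub-regions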
+ def __eq__(self, other):
+ if other is self:
+ return True
+ if not isinstance(other, AudioRegion):
+ return False
+ return (
+ (self._data == other._data)
+ and (self.sr == other.sr)
+ and (self.sw == other.sw)
+ and (self.ch == other.ch)
+ )
+
+ def __getitem__(self, index):
+ err_msg = "Slicing AudioRegion by samples requires indices of type "
+ err_msg += "'int' without a step (e.g. region.sec[1600:3200])"
+ start_sample, stop_sample = _check_convert_index(index, (int), err_msg)
+
+ bytes_per_sample = self.sample_width * self.channels
+ len_samples = len(self._data) // bytes_per_sample
+
+ if start_sample < 0:
+ start_sample = max(start_sample + len_samples, 0)
+ onset = start_sample * bytes_per_sample
+
+ if stop_sample is not None:
+ if stop_sample < 0:
+ stop_sample = max(stop_sample + len_samples, 0)
+                offset = stop_sample * bytes_per_sample
+ else:
+ offset = None
+
+ data = self._data[onset:offset]
+ return AudioRegion(data, self.sr, self.sw, self.ch)
+
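# Editor's note: sketch (not part of the diff). Sample-level slicing
# accepts negative indices, like regular Python sequences.
from auditok import AudioRegion

r = AudioRegion(b"\x00" * 2 * 16000, 16000, 2, 1)   # 1 s, 16-bit mono
assert r[-4000:].duration == 0.25                   # last quarter second
assert r[:8000] == r.ms[:500]                       # same first half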
+
+class StreamTokenizer:
"""
Class for stream tokenizers. It implements a 4-state automaton scheme
to extract sub-sequences of interest on the fly.
-
- :Parameters:
-
- `validator` :
- instance of `DataValidator` that implements `is_valid` method.
-
- `min_length` : *(int)*
- Minimum number of frames of a valid token. This includes all \
- tolerated non valid frames within the token.
-
- `max_length` : *(int)*
- Maximum number of frames of a valid token. This includes all \
- tolerated non valid frames within the token.
-
- `max_continuous_silence` : *(int)*
- Maximum number of consecutive non-valid frames within a token.
- Note that, within a valid token, there may be many tolerated \
- *silent* regions that contain each a number of non valid frames up to \
- `max_continuous_silence`
-
- `init_min` : *(int, default=0)*
- Minimum number of consecutive valid frames that must be **initially** \
- gathered before any sequence of non valid frames can be tolerated. This
- option is not always needed, it can be used to drop non-valid tokens as
- early as possible. **Default = 0** means that the option is by default
- ineffective.
-
- `init_max_silence` : *(int, default=0)*
- Maximum number of tolerated consecutive non-valid frames if the \
- number already gathered valid frames has not yet reached 'init_min'.
- This argument is normally used if `init_min` is used. **Default = 0**,
- by default this argument is not taken into consideration.
-
- `mode` : *(int, default=0)*
- `mode` can be:
-
- 1. `StreamTokenizer.STRICT_MIN_LENGTH`:
- if token *i* is delivered because `max_length`
- is reached, and token *i+1* is immediately adjacent to
- token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
- at frame *k+1*) then accept token *i+1* only of it has a size of at
- least `min_length`. The default behavior is to accept token *i+1*
- event if it is shorter than `min_length` (given that the above conditions
- are fulfilled of course).
-
- :Examples:
-
- In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
- accepted although it is shorter than `min_length` (3), because it immediately
- follows the latest delivered token:
-
- .. code:: python
-
- from auditok import StreamTokenizer, StringDataSource, DataValidator
-
- class UpperCaseChecker(DataValidator):
- def is_valid(self, frame):
- return frame.isupper()
-
-
- dsource = StringDataSource("aaaAAAABBbbb")
- tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
- min_length=3,
- max_length=4,
- max_continuous_silence=0)
-
- tokenizer.tokenize(dsource)
-
-
- :output:
-
- .. code:: python
-
- [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
-
-
- The following tokenizer will however reject the 'BB' token:
-
- .. code:: python
-
- dsource = StringDataSource("aaaAAAABBbbb")
- tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
- min_length=3, max_length=4,
- max_continuous_silence=0,
- mode=StreamTokenizer.STRICT_MIN_LENGTH)
- tokenizer.tokenize(dsource)
-
- :output:
-
- .. code:: python
-
- [(['A', 'A', 'A', 'A'], 3, 6)]
-
-
- 2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing non-valid frames
- from a token to be delivered if and only if it is not **truncated**.
- This can be a bit tricky. A token is actually delivered if:
-
- - a. `max_continuous_silence` is reached
-
- :or:
-
- - b. Its length reaches `max_length`. This is called a **truncated** token
-
- In the current implementation, a `StreamTokenizer`'s decision is only based on already seen
- data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
- frame (`max_length` is reached but `max_continuous_silence` not yet) any tailing
- silence will be kept because it can potentially be part of valid token (if `max_length`
- was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered
- token will not be considered as truncated but a result of *normal* end of detection
- (i.e. no more valid data). In that case the tailing silence can be removed if you use
- the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
-
- :Example:
-
- .. code:: python
-
- tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
- max_length=6, max_continuous_silence=3,
- mode=StreamTokenizer.DROP_TRAILING_SILENCE)
-
- dsource = StringDataSource("aaaAAAaaaBBbbbb")
- tokenizer.tokenize(dsource)
-
- :output:
-
- .. code:: python
-
- [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
-
- The first token is delivered with its tailing silence because it is truncated
- while the second one has its tailing frames removed.
-
- Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
-
- .. code:: python
-
- [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
-
-
-
- 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
- use both options. That means: first remove tailing silence, then ckeck if the
- token still has at least a length of `min_length`.
+
+ Parameters
+ ----------
+ validator : callable, DataValidator (must implement `is_valid`)
+ called with each data frame read from source. Should take one positional
+ argument and return True or False for valid and invalid frames
+ respectively.
+
+ min_length : int
+ Minimum number of frames of a valid token. This includes all
+ tolerated non valid frames within the token.
+
+ max_length : int
+ Maximum number of frames of a valid token. This includes all
+ tolerated non valid frames within the token.
+
+ max_continuous_silence : int
+ Maximum number of consecutive non-valid frames within a token.
+        Note that, within a valid token, there may be many tolerated
+        *silent* regions, each containing up to `max_continuous_silence`
+        non-valid frames.
+
+ init_min : int
+ Minimum number of consecutive valid frames that must be
+ **initially** gathered before any sequence of non valid frames can
+ be tolerated. This option is not always needed, it can be used to
+ drop non-valid tokens as early as possible. **Default = 0** means
+ that the option is by default ineffective.
+
+ init_max_silence : int
+        Maximum number of tolerated consecutive non-valid frames if the
+        number of already gathered valid frames has not yet reached
+        `init_min`. This argument is normally used if `init_min` is used.
+ **Default = 0**, by default this argument is not taken into
+ consideration.
+
+ mode : int
+ mode can be one of the following:
+
+        1. `StreamTokenizer.NORMAL`: do not drop trailing silence, and
+ accept a token shorter than `min_length` if it is the continuation
+ of the latest delivered token.
+
+        2. `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
+ because `max_length` is reached, and token `i+1` is immediately
+ adjacent to token `i` (i.e. token `i` ends at frame `k` and token
+        `i+1` starts at frame `k+1`) then accept token `i+1` only if it
+        has a size of at least `min_length`. The default behavior is to
+        accept token `i+1` even if it is shorter than `min_length`
+        (provided that the above conditions are fulfilled, of course).
+
+        3. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
+ non-valid frames from a token to be delivered if and only if it
+ is not **truncated**. This can be a bit tricky. A token is actually
+ delivered if:
+
+ - `max_continuous_silence` is reached.
+
+ - Its length reaches `max_length`. This is referred to as a
+ **truncated** token.
+
+ In the current implementation, a `StreamTokenizer`'s decision is only
+ based on already seen data and on incoming data. Thus, if a token is
+ truncated at a non-valid but tolerated frame (`max_length` is reached
+        but `max_continuous_silence` not yet) any trailing silence will be kept
+        because it can potentially be part of a valid token (if `max_length` was
+ bigger). But if `max_continuous_silence` is reached before
+ `max_length`, the delivered token will not be considered as truncated
+ but a result of *normal* end of detection (i.e. no more valid data).
+ In that case the trailing silence can be removed if you use the
+ `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
+
+        4. `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`:
+        use both options. That means: first remove trailing silence, then
+        check if the token still has a length of at least `min_length`.
+
+ Examples
+ --------
+
+ In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
+ accepted although it is shorter than `min_length` (3), because it
+ immediately follows the latest delivered token:
+
+ >>> from auditok.core import StreamTokenizer
+    >>> from auditok.util import StringDataSource, DataValidator
+
+ >>> class UpperCaseChecker(DataValidator):
+ >>> def is_valid(self, frame):
+    >>>         return frame.isupper()
+ >>> dsource = StringDataSource("aaaAAAABBbbb")
+ >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+ min_length=3,
+ max_length=4,
+ max_continuous_silence=0)
+ >>> tokenizer.tokenize(dsource)
+ [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
+
+
+ The following tokenizer will however reject the 'BB' token:
+
+ >>> dsource = StringDataSource("aaaAAAABBbbb")
+ >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+ min_length=3, max_length=4,
+ max_continuous_silence=0,
+ mode=StreamTokenizer.STRICT_MIN_LENGTH)
+ >>> tokenizer.tokenize(dsource)
+ [(['A', 'A', 'A', 'A'], 3, 6)]
+
+    The following tokenizer demonstrates `DROP_TRAILING_SILENCE`:
+
+ >>> tokenizer = StreamTokenizer(
+ >>> validator=UpperCaseChecker(),
+ >>> min_length=3,
+ >>> max_length=6,
+ >>> max_continuous_silence=3,
+ >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE
+ >>> )
+ >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
+ >>> tokenizer.tokenize(dsource)
+ [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
+
+    The first token is delivered with its trailing silence because it is
+    truncated while the second one has its trailing frames removed.
+
+ Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
+
+ .. code:: python
+
+ [
+ (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
+ (['B', 'B', 'b', 'b', 'b'], 9, 13)
+ ]
+
"""
-
-
+
SILENCE = 0
POSSIBLE_SILENCE = 1
- POSSIBLE_NOISE = 2
+ POSSIBLE_NOISE = 2
NOISE = 3
-
+ NORMAL = 0
STRICT_MIN_LENGTH = 2
DROP_TRAILING_SILENCE = 4
- # alias
- DROP_TAILING_SILENCE = 4
-
- def __init__(self, validator,
- min_length, max_length, max_continuous_silence,
- init_min=0, init_max_silence=0,
- mode=0):
-
- if not isinstance(validator, DataValidator):
- raise TypeError("'validator' must be an instance of 'DataValidator'")
-
+
+ def __init__(
+ self,
+ validator,
+ min_length,
+ max_length,
+ max_continuous_silence,
+ init_min=0,
+ init_max_silence=0,
+ mode=0,
+ ):
+ if callable(validator):
+ self._is_valid = validator
+ elif isinstance(validator, DataValidator):
+ self._is_valid = validator.is_valid
+ else:
+ raise TypeError(
+ "'validator' must be a callable or an instance of "
+ "DataValidator"
+ )
+
if max_length <= 0:
- raise ValueError("'max_length' must be > 0 (value={0})".format(max_length))
-
+ raise ValueError(
+ "'max_length' must be > 0 (value={0})".format(max_length)
+ )
+
if min_length <= 0 or min_length > max_length:
- raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))
-
+ err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
+ raise ValueError(err_msg.format(min_length))
+
if max_continuous_silence >= max_length:
- raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))
-
+ err_msg = "'max_continuous_silence' must be < 'max_length' "
+ err_msg += "(value={0})"
+ raise ValueError(err_msg.format(max_continuous_silence))
+
if init_min >= max_length:
- raise ValueError("'init_min' must be < 'max_length' (value={0})".format(max_continuous_silence))
-
+ raise ValueError(
+ "'init_min' must be < 'max_length' (value={0})".format(
+ max_continuous_silence
+ )
+ )
+
self.validator = validator
self.min_length = min_length
self.max_length = max_length
self.max_continuous_silence = max_continuous_silence
self.init_min = init_min
self.init_max_silent = init_max_silence
-
- self._mode = None
- self.set_mode(mode)
- self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
- self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
-
+ self._set_mode(mode)
self._deliver = None
self._tokens = None
self._state = None
self._data = None
self._contiguous_token = False
-
self._init_count = 0
self._silence_length = 0
self._start_frame = 0
self._current_frame = 0
-
- def set_mode(self, mode):
- """
- :Parameters:
-
- `mode` : *(int)*
- New mode, must be one of:
-
-
- - `StreamTokenizer.STRICT_MIN_LENGTH`
-
- - `StreamTokenizer.DROP_TRAILING_SILENCE`
-
- - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`
-
- - `0`
-
- See `StreamTokenizer.__init__` for more information about the mode.
- """
-
- if not mode in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE,
- self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]:
-
+
+ def _set_mode(self, mode):
+ strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
+ strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
+ if mode not in [
+ StreamTokenizer.NORMAL,
+ StreamTokenizer.STRICT_MIN_LENGTH,
+ StreamTokenizer.DROP_TRAILING_SILENCE,
+ strict_min_and_drop_trailing,
+ ]:
raise ValueError("Wrong value for mode")
-
self._mode = mode
self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
- self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
-
-
- def get_mode(self):
- """
- Return the current mode. To check whether a specific mode is activated use
- the bitwise 'and' operator `&`. Example:
-
- .. code:: python
-
- if mode & self.STRICT_MIN_LENGTH != 0:
- do_something()
- """
- return self._mode
-
+ self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
+
def _reinitialize(self):
self._contiguous_token = False
self._data = []
@@ -266,112 +1269,114 @@ class StreamTokenizer():
self._state = self.SILENCE
self._current_frame = -1
self._deliver = self._append_token
-
-
- def tokenize(self, data_source, callback=None):
+
+ def tokenize(self, data_source, callback=None, generator=False):
"""
- Read data from `data_source`, one frame a time, and process the read frames in
- order to detect sequences of frames that make up valid tokens.
-
+        Read data from `data_source`, one frame at a time, and process
+        the read frames in order to detect sequences of frames that make
+        up valid tokens.
+
:Parameters:
- `data_source` : instance of the :class:`DataSource` class that implements a `read` method.
- 'read' should return a slice of signal, i.e. frame (of whatever \
- type as long as it can be processed by validator) and None if \
- there is no more signal.
-
+ `data_source` : instance of the :class:`DataSource` class that
+ implements a `read` method. 'read' should return a slice of
+ signal, i.e. frame (of whatever type as long as it can be
+ processed by validator) and None if there is no more signal.
+
`callback` : an optional 3-argument function.
- If a `callback` function is given, it will be called each time a valid token
- is found.
-
-
+ If a `callback` function is given, it will be called each time
+        a valid token is found.
+
+    `generator` : bool, default: False
+        If True, return a generator of tokens instead of a list.
+
+
:Returns:
- A list of tokens if `callback` is None. Each token is tuple with the following elements:
-
+    A list of tokens if `callback` is None, or a generator of tokens if
+    `generator` is True. Each token is a tuple with the following
+    elements:
+
    .. code:: python
-
+
(data, start, end)
-
- where `data` is a list of read frames, `start`: index of the first frame in the
- original data and `end` : index of the last frame.
-
+
+ where `data` is a list of read frames, `start`: index of the first
+ frame in the original data and `end` : index of the last frame.
"""
-
+ token_gen = self._iter_tokens(data_source)
+ if callback:
+ for token in token_gen:
+ callback(*token)
+ return
+ if generator:
+ return token_gen
+ return list(token_gen)
+
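# Editor's note: an illustrative sketch (not part of the diff) of the
# three delivery modes of tokenize(). Since validators may now be plain
# callables, str.isupper stands in for the UpperCaseChecker class used
# in the docstring above.
from auditok.core import StreamTokenizer
from auditok.util import StringDataSource

tokenizer = StreamTokenizer(validator=str.isupper, min_length=3,
                            max_length=4, max_continuous_silence=0)
# 1. eager: a list of (data, start, end) tuples
tokens = tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"))
# 2. lazy: a generator that yields tokens as frames are consumed
gen = tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"), generator=True)
# 3. callback: called once per token; tokenize() then returns None
tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"),
                   callback=lambda data, start, end: print(start, end))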
+ def _iter_tokens(self, data_source):
self._reinitialize()
-
- if callback is not None:
- self._deliver = callback
-
while True:
- frame = data_source.read()
+ frame = data_source.read()
+ self._current_frame += 1
if frame is None:
+ token = self._post_process()
+ if token is not None:
+ yield token
break
- self._current_frame += 1
- self._process(frame)
-
- self._post_process()
-
- if callback is None:
- _ret = self._tokens
- self._tokens = None
- return _ret
-
-
- def _process(self, frame):
-
- frame_is_valid = self.validator.is_valid(frame)
-
+ token = self._process(frame)
+ if token is not None:
+ yield token
+
+ def _process(self, frame): # noqa: C901
+
+ frame_is_valid = self._is_valid(frame)
+
if self._state == self.SILENCE:
-
+
if frame_is_valid:
# seems we got a valid frame after a silence
self._init_count = 1
self._silence_length = 0
self._start_frame = self._current_frame
self._data.append(frame)
-
- if self._init_count >= self.init_min:
+
+ if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
- self._process_end_of_detection(True)
+ return self._process_end_of_detection(True)
else:
self._state = self.POSSIBLE_NOISE
-
+
elif self._state == self.POSSIBLE_NOISE:
-
+
if frame_is_valid:
self._silence_length = 0
self._init_count += 1
self._data.append(frame)
- if self._init_count >= self.init_min:
+ if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
- self._process_end_of_detection(True)
-
- else:
+ return self._process_end_of_detection(True)
+
+ else:
self._silence_length += 1
- if self._silence_length > self.init_max_silent or \
- len(self._data) + 1 >= self.max_length:
+ if (
+ self._silence_length > self.init_max_silent
+ or len(self._data) + 1 >= self.max_length
+ ):
# either init_max_silent or max_length is reached
# before _init_count, back to silence
self._data = []
self._state = self.SILENCE
else:
self._data.append(frame)
-
-
+
elif self._state == self.NOISE:
-
+
if frame_is_valid:
self._data.append(frame)
if len(self._data) >= self.max_length:
- self._process_end_of_detection(True)
-
- elif self.max_continuous_silence <= 0 :
- # max token reached at this frame will _deliver if _contiguous_token
- # and not _strict_min_length
- self._process_end_of_detection()
+ return self._process_end_of_detection(True)
+
+ elif self.max_continuous_silence <= 0:
+                # no silence tolerated: this non-valid frame ends the
+                # token, which is delivered if it has min_length frames
+                # or if it continues a truncated token (_contiguous_token)
+                # and STRICT_MIN_LENGTH is not set
self._state = self.SILENCE
-
+ return self._process_end_of_detection()
else:
# this is the first silent frame following a valid one
# and it is tolerated
@@ -379,61 +1384,63 @@ class StreamTokenizer():
self._data.append(frame)
self._state = self.POSSIBLE_SILENCE
if len(self._data) == self.max_length:
- self._process_end_of_detection(True)
- # don't reset _silence_length because we still
+ return self._process_end_of_detection(True)
+ # don't reset _silence_length because we still
# need to know the total number of silent frames
-
-
-
+
elif self._state == self.POSSIBLE_SILENCE:
-
+
if frame_is_valid:
self._data.append(frame)
self._silence_length = 0
self._state = self.NOISE
if len(self._data) >= self.max_length:
- self._process_end_of_detection(True)
-
+ return self._process_end_of_detection(True)
+
else:
if self._silence_length >= self.max_continuous_silence:
- if self._silence_length < len(self._data):
- # _deliver only gathered frames aren't all silent
- self._process_end_of_detection()
- else:
- self._data = []
self._state = self.SILENCE
+ if self._silence_length < len(self._data):
+                    # deliver only if the gathered frames aren't all silent
+ return self._process_end_of_detection()
+ self._data = []
self._silence_length = 0
else:
self._data.append(frame)
self._silence_length += 1
if len(self._data) >= self.max_length:
- self._process_end_of_detection(True)
- # don't reset _silence_length because we still
+ return self._process_end_of_detection(True)
+ # don't reset _silence_length because we still
# need to know the total number of silent frames
-
-
+
def _post_process(self):
if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
if len(self._data) > 0 and len(self._data) > self._silence_length:
- self._process_end_of_detection()
-
-
+ return self._process_end_of_detection()
+
def _process_end_of_detection(self, truncated=False):
-
- if not truncated and self._drop_tailing_silence and self._silence_length > 0:
+
+ if (
+ not truncated
+ and self._drop_trailing_silence
+ and self._silence_length > 0
+ ):
# happens if max_continuous_silence is reached
# or max_length is reached at a silent frame
- self._data = self._data[0: - self._silence_length]
-
- if (len(self._data) >= self.min_length) or \
- (len(self._data) > 0 and \
- not self._strict_min_length and self._contiguous_token):
-
-
-
- _end_frame = self._start_frame + len(self._data) - 1
- self._deliver(self._data, self._start_frame, _end_frame)
-
+ self._data = self._data[0 : -self._silence_length]
+
+ if (len(self._data) >= self.min_length) or (
+ len(self._data) > 0
+ and not self._strict_min_length
+ and self._contiguous_token
+ ):
+
+ start_frame = self._start_frame
+ end_frame = self._start_frame + len(self._data) - 1
+ data = self._data
+ self._data = []
+ token = (data, start_frame, end_frame)
+
if truncated:
# next token (if any) will start at _current_frame + 1
self._start_frame = self._current_frame + 1
@@ -441,12 +1448,11 @@ class StreamTokenizer():
self._contiguous_token = True
else:
self._contiguous_token = False
+ return token
else:
- self._contiguous_token = False
-
+ self._contiguous_token = False
+
self._data = []
-
-
-
+
def _append_token(self, data, start, end):
self._tokens.append((data, start, end))
diff --git a/libs/auditok/dataset.py b/libs/auditok/dataset.py
index 1a3a7af5c..98dc5d1d4 100644
--- a/libs/auditok/dataset.py
+++ b/libs/auditok/dataset.py
@@ -1,19 +1,31 @@
"""
-This module contains links to audio files you can use for test purposes.
+This module contains links to audio files that can be used for test purposes.
+
+.. autosummary::
+ :toctree: generated/
+
+ one_to_six_arabic_16000_mono_bc_noise
+ was_der_mensch_saet_mono_44100_lead_trail_silence
"""
import os
-__all__ = ["one_to_six_arabic_16000_mono_bc_noise", "was_der_mensch_saet_mono_44100_lead_trail_silence"]
+__all__ = [
+ "one_to_six_arabic_16000_mono_bc_noise",
+ "was_der_mensch_saet_mono_44100_lead_trail_silence",
+]
_current_dir = os.path.dirname(os.path.realpath(__file__))
one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\
-16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep)
+16000_mono_bc_noise.wav".format(
+ cd=_current_dir, sep=os.path.sep
+)
"""A wave file that contains a pronunciation of Arabic numbers from 1 to 6"""
-
was_der_mensch_saet_mono_44100_lead_trail_silence = "{cd}{sep}data{sep}was_\
der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_\
-silence.wav".format(cd=_current_dir, sep=os.path.sep)
-""" A wave file that contains a sentence between long leading and trailing periods of silence""" \ No newline at end of file
+silence.wav".format(
+ cd=_current_dir, sep=os.path.sep
+)
+"""A wave file that contains a sentence with a long leading and trailing silence"""
diff --git a/libs/auditok/exceptions.py b/libs/auditok/exceptions.py
index 0026a9d89..7bc5054ee 100644
--- a/libs/auditok/exceptions.py
+++ b/libs/auditok/exceptions.py
@@ -1,9 +1,41 @@
-"""
-November 2015
-@author: Amine SEHILI <[email protected]>
-"""
-
class DuplicateArgument(Exception):
pass
+class TooSamllBlockDuration(ValueError):
+ """Raised when block_dur results in a block_size smaller than one sample."""
+
+ def __init__(self, message, block_dur, sampling_rate):
+ self.block_dur = block_dur
+ self.sampling_rate = sampling_rate
+ super(TooSamllBlockDuration, self).__init__(message)
+
+
+class TimeFormatError(Exception):
+ """Raised when a duration formatting directive is unknown."""
+
+
+class EndOfProcessing(Exception):
+ """Raised within command line script's main function to jump to
+ postprocessing code."""
+
+
+class AudioIOError(Exception):
+ """Raised when a compressed audio file cannot be loaded or when trying
+ to read from a not yet open AudioSource"""
+
+
+class AudioParameterError(AudioIOError):
+ """Raised when one audio parameter is missing when loading raw data or
+ saving data to a format other than raw. Also raised when an audio
+ parameter has a wrong value."""
+
+
+class AudioEncodingError(Exception):
+ """Raised if audio data can not be encoded in the provided format"""
+
+
+class AudioEncodingWarning(RuntimeWarning):
+ """Raised if audio data can not be encoded in the provided format
+ but saved as wav.
+ """
diff --git a/libs/auditok/io.py b/libs/auditok/io.py
index 665ab274d..b5fb61a76 100644
--- a/libs/auditok/io.py
+++ b/libs/auditok/io.py
@@ -1,499 +1,1021 @@
"""
Module for low-level audio input-output operations.
-Class summary
-=============
-
.. autosummary::
+ :toctree: generated/
- AudioSource
- Rewindable
- BufferAudioSource
- WaveAudioSource
- PyAudioSource
- StdinAudioSource
- PyAudioPlayer
-
+ AudioSource
+ Rewindable
+ BufferAudioSource
+ WaveAudioSource
+ PyAudioSource
+ StdinAudioSource
+ PyAudioPlayer
+ from_file
+ to_file
+ player_for
+"""
+import os
+import sys
+import wave
+import warnings
+from abc import ABC, abstractmethod
+from functools import partial
+from .exceptions import AudioIOError, AudioParameterError
-Function summary
-================
+try:
+ from pydub import AudioSegment
-.. autosummary::
+ _WITH_PYDUB = True
+except ImportError:
+ _WITH_PYDUB = False
- from_file
- player_for
-"""
+try:
+ from tqdm import tqdm as _tqdm
-from abc import ABCMeta, abstractmethod
-import wave
-import sys
+ DEFAULT_BAR_FORMAT_TQDM = "|" + "{bar}" + "|" + "[{elapsed}/{duration}]"
+ DEFAULT_NCOLS_TQDM = 30
+ DEFAULT_MIN_INTERVAL_TQDM = 0.05
+ _WITH_TQDM = True
+except ImportError:
+ _WITH_TQDM = False
-__all__ = ["AudioSource", "Rewindable", "BufferAudioSource", "WaveAudioSource",
- "PyAudioSource", "StdinAudioSource", "PyAudioPlayer", "from_file", "player_for"]
-DEFAULT_SAMPLE_RATE = 16000
+__all__ = [
+ "AudioSource",
+ "Rewindable",
+ "BufferAudioSource",
+ "RawAudioSource",
+ "WaveAudioSource",
+ "PyAudioSource",
+ "StdinAudioSource",
+ "PyAudioPlayer",
+ "from_file",
+ "to_file",
+ "player_for",
+]
+
+DEFAULT_SAMPLING_RATE = 16000
DEFAULT_SAMPLE_WIDTH = 2
DEFAULT_NB_CHANNELS = 1
-class AudioSource():
- """
+def check_audio_data(data, sample_width, channels):
+ sample_size_bytes = int(sample_width * channels)
+ nb_samples = len(data) // sample_size_bytes
+ if nb_samples * sample_size_bytes != len(data):
+ raise AudioParameterError(
+ "The length of audio data must be an integer "
+ "multiple of `sample_width * channels`"
+ )
+
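# Editor's note: sketch (not part of the diff). check_audio_data only
# verifies that the byte count aligns with whole multi-channel samples.
from auditok.io import check_audio_data

check_audio_data(b"\x00" * 8, sample_width=2, channels=2)   # 2 frames: ok
# check_audio_data(b"\x00" * 7, 2, 2) would raise AudioParameterError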
+
+def _guess_audio_format(fmt, filename):
+ if fmt is None:
+ extension = os.path.splitext(filename.lower())[1][1:]
+ if extension:
+ fmt = extension
+ else:
+ return None
+ fmt = fmt.lower()
+ if fmt == "wave":
+ fmt = "wav"
+ return fmt
+
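# Editor's note: sketch (not part of the diff) of the format-guessing
# precedence implemented above.
from auditok.io import _guess_audio_format

assert _guess_audio_format(None, "rec.WAV") == "wav"    # from extension
assert _guess_audio_format("wave", "rec.dat") == "wav"  # alias normalized
assert _guess_audio_format(None, "rec") is None         # nothing to guess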
+
+def _get_audio_parameters(param_dict):
+ """
+ Get audio parameters from a dictionary of parameters. An audio parameter can
+ have a long name or a short name. If the long name is present, the short
+    name will be ignored. If neither is present, or if a value is not a
+    positive integer, `AudioParameterError` is raised.
+
+ Expected parameters are:
+
+ - `sampling_rate`, `sr` : int, sampling rate.
+
+ - `sample_width`, `sw` : int, sample size in bytes.
+
+ - `channels`, `ch` : int, number of channels.
+
+ Returns
+ -------
+ audio_parameters : tuple
+ a tuple for audio parameters as (sampling_rate, sample_width, channels).
+ """
+ err_message = (
+ "'{ln}' (or '{sn}') must be a positive integer, found: '{val}'"
+ )
+ parameters = []
+ for (long_name, short_name) in (
+ ("sampling_rate", "sr"),
+ ("sample_width", "sw"),
+ ("channels", "ch"),
+ ):
+ param = param_dict.get(long_name, param_dict.get(short_name))
+ if param is None or not isinstance(param, int) or param <= 0:
+ raise AudioParameterError(
+ err_message.format(ln=long_name, sn=short_name, val=param)
+ )
+ parameters.append(param)
+ sampling_rate, sample_width, channels = parameters
+ return sampling_rate, sample_width, channels
+
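# Editor's note: sketch (not part of the diff); long parameter names
# take precedence over their short aliases.
from auditok.io import _get_audio_parameters

params = {"sr": 8000, "sampling_rate": 16000, "sw": 2, "ch": 1}
assert _get_audio_parameters(params) == (16000, 2, 1)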
+
+class AudioSource(ABC):
+ """
Base class for audio source objects.
-
- Subclasses should implement methods to open/close and audio stream
+
+    Subclasses should implement methods to open/close an audio stream
and read the desired amount of audio samples.
-
- :Parameters:
-
- `sampling_rate` : int
- Number of samples per second of audio stream. Default = 16000.
-
- `sample_width` : int
- Size in bytes of one audio sample. Possible values : 1, 2, 4.
- Default = 2.
-
- `channels` : int
- Number of channels of audio stream. The current version supports
- only mono audio streams (i.e. one channel).
- """
-
- __metaclass__ = ABCMeta
-
- def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
- sample_width = DEFAULT_SAMPLE_WIDTH,
- channels = DEFAULT_NB_CHANNELS):
-
- if not sample_width in (1, 2, 4):
- raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)")
-
- if channels != 1:
- raise ValueError("Only mono audio is currently handled")
-
- self.sampling_rate = sampling_rate
- self.sample_width = sample_width
- self.channels = channels
-
+
+ Parameters
+ ----------
+ sampling_rate : int
+ number of samples per second of audio data.
+ sample_width : int
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int
+ number of channels of audio data.
+ """
+
+ def __init__(
+ self, sampling_rate, sample_width, channels,
+ ):
+
+ if sample_width not in (1, 2, 4):
+ raise AudioParameterError(
+ "Sample width must be one of: 1, 2 or 4 (bytes)"
+ )
+
+ self._sampling_rate = sampling_rate
+ self._sample_width = sample_width
+ self._channels = channels
+
@abstractmethod
def is_open(self):
- """ Return True if audio source is open, False otherwise """
-
+ """Return True if audio source is open, False otherwise."""
+
@abstractmethod
def open(self):
- """ Open audio source """
-
+ """Open audio source."""
+
@abstractmethod
def close(self):
- """ Close audio source """
-
+ """Close audio source."""
+
@abstractmethod
def read(self, size):
"""
Read and return `size` audio samples at most.
-
- :Parameters:
-
- `size` : int
- the number of samples to read.
-
- :Returns:
-
- Audio data as a string of length 'N' * 'smaple_width' * 'channels', where 'N' is:
-
- - `size` if `size` < 'left_samples'
-
- - 'left_samples' if `size` > 'left_samples'
-
- """
-
- def get_sampling_rate(self):
- """ Return the number of samples per second of audio stream """
- return self.sampling_rate
-
- def get_sample_width(self):
- """ Return the number of bytes used to represent one audio sample """
- return self.sample_width
-
- def get_channels(self):
- """ Return the number of channels of this audio source """
+
+ Parameters
+ -----------
+ size : int
+ Number of samples to read.
+
+ Returns
+ -------
+ data : bytes
+ Audio data as a bytes object of length `N * sample_width * channels`
+ where `N` equals:
+
+ - `size` if `size` <= remaining samples
+
+ - remaining samples if `size` > remaining samples
+ """
+
+ @property
+ def sampling_rate(self):
+ """Number of samples per second of audio stream."""
+ return self._sampling_rate
+
+ @property
+ def sr(self):
+ """Number of samples per second of audio stream (alias for
+        `sampling_rate`)."""
+ return self._sampling_rate
+
+ @property
+ def sample_width(self):
+ """Number of bytes used to represent one audio sample."""
+ return self._sample_width
+
+ @property
+ def sw(self):
+ """Number of bytes used to represent one audio sample (alias for
+ `sample_width`)."""
+ return self._sample_width
+
+ @property
+ def channels(self):
+ """Number of channels in audio stream."""
+ return self._channels
+
+ @property
+ def ch(self):
+ """Number of channels in audio stream (alias for `channels`)."""
return self.channels
-
-class Rewindable():
+class Rewindable(AudioSource):
"""
Base class for rewindable audio streams.
- Subclasses should implement methods to return to the beginning of an
- audio stream as well as method to move to an absolute audio position
- expressed in time or in number of samples.
+
+    Subclasses should implement a method to return to the start of the
+    stream (`rewind`), as well as a property getter/setter named `position`
+    that reads/sets the stream position, expressed in number of samples.
"""
-
- __metaclass__ = ABCMeta
-
+
@abstractmethod
def rewind(self):
- """ Go back to the beginning of audio stream """
- pass
-
- @abstractmethod
- def get_position(self):
- """ Return the total number of already read samples """
-
- @abstractmethod
- def get_time_position(self):
- """ Return the total duration in seconds of already read data """
-
+ """Go back to the beginning of audio stream."""
+
+ @property
@abstractmethod
- def set_position(self, position):
- """ Move to an absolute position
-
- :Parameters:
-
- `position` : int
- number of samples to skip from the start of the stream
- """
-
+ def position(self):
+ """Return stream position in number of samples."""
+
+ @position.setter
@abstractmethod
- def set_time_position(self, time_position):
- """ Move to an absolute position expressed in seconds
-
- :Parameters:
-
- `time_position` : float
- seconds to skip from the start of the stream
- """
- pass
+ def position(self, position):
+ """Set stream position in number of samples."""
+
+ @property
+ def position_s(self):
+ """Return stream position in seconds."""
+ return self.position / self.sampling_rate
-
+ @position_s.setter
+ def position_s(self, position_s):
+ """Set stream position in seconds."""
+ self.position = int(self.sampling_rate * position_s)
-class BufferAudioSource(AudioSource, Rewindable):
+ @property
+ def position_ms(self):
+ """Return stream position in milliseconds."""
+ return (self.position * 1000) // self.sampling_rate
+
+ @position_ms.setter
+ def position_ms(self, position_ms):
+ """Set stream position in milliseconds."""
+ if not isinstance(position_ms, int):
+ raise ValueError("position_ms should be an int")
+ self.position = int(self.sampling_rate * position_ms / 1000)
+
+
+class BufferAudioSource(Rewindable):
"""
- An :class:`AudioSource` that encapsulates and reads data from a memory buffer.
- It implements methods from :class:`Rewindable` and is therefore a navigable :class:`AudioSource`.
+ An `AudioSource` that encapsulates and reads data from a memory buffer.
+
+    This class implements the `Rewindable` interface.
+
+ Parameters
+ ----------
+ data : bytes
+ audio data
+ sampling_rate : int, default: 16000
+ number of samples per second of audio data.
+ sample_width : int, default: 2
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int, default: 1
+ number of channels of audio data.
"""
-
- def __init__(self, data_buffer,
- sampling_rate = DEFAULT_SAMPLE_RATE,
- sample_width = DEFAULT_SAMPLE_WIDTH,
- channels = DEFAULT_NB_CHANNELS):
-
- if len(data_buffer) % (sample_width * channels) !=0:
- raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
-
+
+ def __init__(
+ self, data, sampling_rate=16000, sample_width=2, channels=1,
+ ):
AudioSource.__init__(self, sampling_rate, sample_width, channels)
- self._buffer = data_buffer
- self._index = 0
- self._left = 0 if self._buffer is None else len(self._buffer)
+ check_audio_data(data, sample_width, channels)
+ self._data = data
+ self._sample_size_all_channels = sample_width * channels
+ self._current_position_bytes = 0
self._is_open = False
-
+
def is_open(self):
return self._is_open
-
+
def open(self):
self._is_open = True
-
+
def close(self):
self._is_open = False
self.rewind()
-
+
def read(self, size):
if not self._is_open:
- raise IOError("Stream is not open")
-
- if self._left > 0:
-
- to_read = size * self.sample_width * self.channels
- if to_read > self._left:
- to_read = self._left
-
- data = self._buffer[self._index: self._index + to_read]
- self._index += to_read
- self._left -= to_read
-
+ raise AudioIOError("Stream is not open")
+ if size is None or size < 0:
+ offset = None
+ else:
+ bytes_to_read = self._sample_size_all_channels * size
+ offset = self._current_position_bytes + bytes_to_read
+ data = self._data[self._current_position_bytes : offset]
+ if data:
+ self._current_position_bytes += len(data)
return data
-
return None
-
- def get_data_buffer(self):
- """ Return all audio data as one string buffer. """
- return self._buffer
-
- def set_data(self, data_buffer):
- """ Set new data for this audio stream.
-
- :Parameters:
-
- `data_buffer` : str, basestring, Bytes
- a string buffer with a length multiple of (sample_width * channels)
- """
- if len(data_buffer) % (self.sample_width * self.channels) !=0:
- raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
- self._buffer = data_buffer
- self._index = 0
- self._left = 0 if self._buffer is None else len(self._buffer)
-
- def append_data(self, data_buffer):
- """ Append data to this audio stream
-
- :Parameters:
-
- `data_buffer` : str, basestring, Bytes
- a buffer with a length multiple of (sample_width * channels)
- """
-
- if len(data_buffer) % (self.sample_width * self.channels) !=0:
- raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
-
- self._buffer += data_buffer
- self._left += len(data_buffer)
-
-
+
+ @property
+ def data(self):
+ """Get raw audio data as a `bytes` object."""
+ return self._data
+
def rewind(self):
- self.set_position(0)
-
- def get_position(self):
- return self._index / self.sample_width
-
- def get_time_position(self):
- return float(self._index) / (self.sample_width * self.sampling_rate)
-
- def set_position(self, position):
- if position < 0:
- raise ValueError("position must be >= 0")
-
- if self._buffer is None:
- self._index = 0
- self._left = 0
- return
-
- position *= self.sample_width
- self._index = position if position < len(self._buffer) else len(self._buffer)
- self._left = len(self._buffer) - self._index
+ self.position = 0
+
+ @property
+ def position(self):
+ """Get stream position in number of samples"""
+ return self._current_position_bytes // self._sample_size_all_channels
+ @position.setter
+ def position(self, position):
+ """Set stream position in number of samples."""
+ position *= self._sample_size_all_channels
+ if position < 0:
+ position += len(self.data)
+ if position < 0 or position > len(self.data):
+ raise IndexError("Position out of range")
+ self._current_position_bytes = position
- def set_time_position(self, time_position): # time in seconds
- position = int(self.sampling_rate * time_position)
- self.set_position(position)
+ @property
+ def position_ms(self):
+ """Get stream position in milliseconds."""
+ return (self._current_position_bytes * 1000) // (
+ self._sample_size_all_channels * self.sampling_rate
+ )
+ @position_ms.setter
+ def position_ms(self, position_ms):
+ """Set stream position in milliseconds."""
+ if not isinstance(position_ms, int):
+ raise ValueError("position_ms should be an int")
+ self.position = int(self.sampling_rate * position_ms / 1000)
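# Editor's note: an illustrative sketch (not part of the diff) of
# navigating a BufferAudioSource through the Rewindable interface.
from auditok.io import BufferAudioSource

src = BufferAudioSource(b"\x00" * 32000, sampling_rate=16000,
                        sample_width=2, channels=1)   # 1 s of silence
src.open()
src.position_ms = 500              # jump to the middle
data = src.read(1600)              # read 100 ms (1600 samples)
assert src.position == 8000 + 1600
src.rewind()
assert src.position == 0
src.close()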
-class WaveAudioSource(AudioSource):
+class FileAudioSource(AudioSource):
"""
- A class for an `AudioSource` that reads data from a wave file.
-
- :Parameters:
-
- `filename` :
- path to a valid wave file
+    Base class for `AudioSource`s that read audio data from a file.
+
+ Parameters
+ ----------
+ sampling_rate : int, default: 16000
+ number of samples per second of audio data.
+ sample_width : int, default: 2
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int, default: 1
+ number of channels of audio data.
"""
-
- def __init__(self, filename):
-
- self._filename = filename
+
+ def __init__(self, sampling_rate, sample_width, channels):
+ AudioSource.__init__(self, sampling_rate, sample_width, channels)
self._audio_stream = None
-
- stream = wave.open(self._filename)
- AudioSource.__init__(self, stream.getframerate(),
- stream.getsampwidth(),
- stream.getnchannels())
- stream.close()
-
-
+
+ def __del__(self):
+ if self.is_open():
+ self.close()
+
def is_open(self):
return self._audio_stream is not None
-
- def open(self):
- if(self._audio_stream is None):
- self._audio_stream = wave.open(self._filename)
-
-
+
def close(self):
if self._audio_stream is not None:
self._audio_stream.close()
self._audio_stream = None
-
-
+
+ @abstractmethod
+ def _read_from_stream(self, size):
+ """Read data from stream"""
+
def read(self, size):
+ if not self.is_open():
+ raise AudioIOError("Audio stream is not open")
+ data = self._read_from_stream(size)
+ if not data:
+ return None
+ return data
+
+
+class RawAudioSource(FileAudioSource):
+ """
+ A class for an `AudioSource` that reads data from a raw (headerless) audio
+ file.
+
+ This class should be used for large raw audio files to avoid loading the
+ whole data to memory.
+
+ Parameters
+ ----------
+    file : str
+        path to a raw audio file.
+ sampling_rate : int
+ Number of samples per second of audio data.
+ sample_width : int
+ Size in bytes of one audio sample. Possible values : 1, 2, 4.
+ channels : int
+ Number of channels of audio data.
+ """
+
+ def __init__(self, file, sampling_rate, sample_width, channels):
+ FileAudioSource.__init__(self, sampling_rate, sample_width, channels)
+ self._file = file
+ self._audio_stream = None
+ self._sample_size = sample_width * channels
+
+ def open(self):
if self._audio_stream is None:
- raise IOError("Stream is not open")
+ self._audio_stream = open(self._file, "rb")
+
+ def _read_from_stream(self, size):
+ if size is None or size < 0:
+ bytes_to_read = None
else:
- data = self._audio_stream.readframes(size)
- if data is None or len(data) < 1:
- return None
- return data
+ bytes_to_read = size * self._sample_size
+ data = self._audio_stream.read(bytes_to_read)
+ return data
+
+
+class WaveAudioSource(FileAudioSource):
+ """
+ A class for an `AudioSource` that reads data from a wave file.
+
+ This class should be used for large wave files to avoid loading the whole
+ data to memory.
+
+ Parameters
+ ----------
+ filename : str
+ path to a valid wave file.
+ """
+
+ def __init__(self, filename):
+ self._filename = filename
+ self._audio_stream = None
+ stream = wave.open(self._filename, "rb")
+ FileAudioSource.__init__(
+ self,
+ stream.getframerate(),
+ stream.getsampwidth(),
+ stream.getnchannels(),
+ )
+ stream.close()
+
+ def open(self):
+ if self._audio_stream is None:
+ self._audio_stream = wave.open(self._filename)
+
+ def _read_from_stream(self, size):
+ if size is None or size < 0:
+ size = -1
+ return self._audio_stream.readframes(size)
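A brief usage sketch for these lazy sources, assuming a large wave file (the file name is hypothetical):

.. code:: python

    from auditok.io import WaveAudioSource

    src = WaveAudioSource("very_large_recording.wav")  # hypothetical file
    src.open()
    while True:
        window = src.read(4096)  # read 4096 samples per call
        if window is None:       # end of stream
            break
        # process `window` (a bytes object) here
    src.close()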
class PyAudioSource(AudioSource):
"""
- A class for an `AudioSource` that reads data the built-in microphone using PyAudio.
+    A class for an `AudioSource` that reads data from the built-in microphone
+    using PyAudio (https://people.csail.mit.edu/hubert/pyaudio/).
+
+ Parameters
+ ----------
+ sampling_rate : int, default: 16000
+ number of samples per second of audio data.
+ sample_width : int, default: 2
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int, default: 1
+ number of channels of audio data.
+ frames_per_buffer : int, default: 1024
+ PyAudio number of frames per buffer.
+ input_device_index: None or int, default: None
+        PyAudio index of the audio device to read audio data from. If None,
+        the default device is used.
"""
-
- def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
- sample_width = DEFAULT_SAMPLE_WIDTH,
- channels = DEFAULT_NB_CHANNELS,
- frames_per_buffer = 1024):
-
-
+
+ def __init__(
+ self,
+ sampling_rate=16000,
+ sample_width=2,
+ channels=1,
+ frames_per_buffer=1024,
+ input_device_index=None,
+ ):
+
AudioSource.__init__(self, sampling_rate, sample_width, channels)
self._chunk_size = frames_per_buffer
-
+ self.input_device_index = input_device_index
+
import pyaudio
+
self._pyaudio_object = pyaudio.PyAudio()
- self._pyaudio_format = self._pyaudio_object.get_format_from_width(self.sample_width)
+ self._pyaudio_format = self._pyaudio_object.get_format_from_width(
+ self.sample_width
+ )
self._audio_stream = None
-
def is_open(self):
return self._audio_stream is not None
-
+
def open(self):
- self._audio_stream = self._pyaudio_object.open(format = self._pyaudio_format,
- channels = self.channels,
- rate = self.sampling_rate,
- input = True,
- output = False,
- frames_per_buffer = self._chunk_size)
-
-
+ self._audio_stream = self._pyaudio_object.open(
+ format=self._pyaudio_format,
+ channels=self.channels,
+ rate=self.sampling_rate,
+ input=True,
+ output=False,
+ input_device_index=self.input_device_index,
+ frames_per_buffer=self._chunk_size,
+ )
+
def close(self):
if self._audio_stream is not None:
self._audio_stream.stop_stream()
self._audio_stream.close()
self._audio_stream = None
-
-
+
def read(self, size):
if self._audio_stream is None:
raise IOError("Stream is not open")
-
if self._audio_stream.is_active():
data = self._audio_stream.read(size)
if data is None or len(data) < 1:
return None
return data
-
return None
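A short sketch of reading from the microphone with this class, assuming PyAudio is installed:

.. code:: python

    from auditok.io import PyAudioSource

    mic = PyAudioSource(sampling_rate=16000, sample_width=2, channels=1)
    mic.open()
    # 16 buffers of 1024 samples at 16 kHz is roughly one second of audio
    chunks = [mic.read(1024) for _ in range(16)]
    mic.close()
    data = b"".join(chunk for chunk in chunks if chunk is not None)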
-
-class StdinAudioSource(AudioSource):
+
+class StdinAudioSource(FileAudioSource):
"""
- A class for an :class:`AudioSource` that reads data from standard input.
+ A class for an `AudioSource` that reads data from standard input.
+
+ Parameters
+ ----------
+ sampling_rate : int, default: 16000
+ number of samples per second of audio data.
+ sample_width : int, default: 2
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int, default: 1
+ number of channels of audio data.
"""
-
- def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
- sample_width = DEFAULT_SAMPLE_WIDTH,
- channels = DEFAULT_NB_CHANNELS):
-
- AudioSource.__init__(self, sampling_rate, sample_width, channels)
+
+ def __init__(
+ self, sampling_rate=16000, sample_width=2, channels=1,
+ ):
+ FileAudioSource.__init__(self, sampling_rate, sample_width, channels)
self._is_open = False
-
-
+ self._sample_size = sample_width * channels
+ self._stream = sys.stdin.buffer
+
def is_open(self):
return self._is_open
-
+
def open(self):
self._is_open = True
-
+
def close(self):
self._is_open = False
-
- def read(self, size):
- if not self._is_open:
- raise IOError("Stream is not open")
-
- to_read = size * self.sample_width * self.channels
- data = sys.stdin.read(to_read)
-
- if data is None or len(data) < 1:
- return None
-
- return data
-
-
-class PyAudioPlayer():
+
+ def _read_from_stream(self, size):
+ bytes_to_read = size * self._sample_size
+ data = self._stream.read(bytes_to_read)
+ if data:
+ return data
+ return None
+
+
+def _make_tqdm_progress_bar(iterable, total, duration, **tqdm_kwargs):
+ fmt = tqdm_kwargs.get("bar_format", DEFAULT_BAR_FORMAT_TQDM)
+ fmt = fmt.replace("{duration}", "{:.3f}".format(duration))
+ tqdm_kwargs["bar_format"] = fmt
+
+ tqdm_kwargs["ncols"] = tqdm_kwargs.get("ncols", DEFAULT_NCOLS_TQDM)
+ tqdm_kwargs["mininterval"] = tqdm_kwargs.get(
+ "mininterval", DEFAULT_MIN_INTERVAL_TQDM
+ )
+ return _tqdm(iterable, total=total, **tqdm_kwargs)
+
+
+class PyAudioPlayer:
"""
     A class for audio playback using PyAudio
+ (https://people.csail.mit.edu/hubert/pyaudio/).
+
+ Parameters
+ ----------
+ sampling_rate : int, default: 16000
+ number of samples per second of audio data.
+ sample_width : int, default: 2
+ size in bytes of one audio sample. Possible values: 1, 2 or 4.
+ channels : int, default: 1
+ number of channels of audio data.
"""
-
- def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
- sample_width = DEFAULT_SAMPLE_WIDTH,
- channels = DEFAULT_NB_CHANNELS):
- if not sample_width in (1, 2, 4):
- raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)")
-
+
+ def __init__(
+ self, sampling_rate=16000, sample_width=2, channels=1,
+ ):
+ if sample_width not in (1, 2, 4):
+ raise ValueError("Sample width in bytes must be one of 1, 2 or 4")
+
self.sampling_rate = sampling_rate
self.sample_width = sample_width
self.channels = channels
-
+
import pyaudio
+
self._p = pyaudio.PyAudio()
- self.stream = self._p.open(format = self._p.get_format_from_width(self.sample_width),
- channels = self.channels, rate = self.sampling_rate,
- input = False, output = True)
-
- def play(self, data):
+ self.stream = self._p.open(
+ format=self._p.get_format_from_width(self.sample_width),
+ channels=self.channels,
+ rate=self.sampling_rate,
+ input=False,
+ output=True,
+ )
+
+ def play(self, data, progress_bar=False, **progress_bar_kwargs):
+ chunk_gen, nb_chunks = self._chunk_data(data)
+ if progress_bar and _WITH_TQDM:
+ duration = len(data) / (
+ self.sampling_rate * self.sample_width * self.channels
+ )
+ chunk_gen = _make_tqdm_progress_bar(
+ chunk_gen,
+ total=nb_chunks,
+ duration=duration,
+ **progress_bar_kwargs
+ )
if self.stream.is_stopped():
self.stream.start_stream()
-
- for chunk in self._chunk_data(data):
- self.stream.write(chunk)
-
+ try:
+ for chunk in chunk_gen:
+ self.stream.write(chunk)
+ except KeyboardInterrupt:
+ pass
self.stream.stop_stream()
-
- def stop(self):
+
+ def stop(self):
if not self.stream.is_stopped():
self.stream.stop_stream()
self.stream.close()
self._p.terminate()
-
+
def _chunk_data(self, data):
# make audio chunks of 100 ms to allow interruption (like ctrl+c)
- chunk_size = int((self.sampling_rate * self.sample_width * self.channels) / 10)
- start = 0
- while start < len(data):
- yield data[start : start + chunk_size]
- start += chunk_size
-
-
-def from_file(filename):
- """
- Create an `AudioSource` object using the audio file specified by `filename`.
- The appropriate :class:`AudioSource` class is guessed from file's extension.
-
- :Parameters:
-
- `filename` :
- path to an audio file.
-
- :Returns:
-
- an `AudioSource` object that reads data from the given file.
-
- """
-
- if filename.lower().endswith(".wav"):
- return WaveAudioSource(filename)
-
- raise Exception("Can not create an AudioSource object from '%s'" %(filename))
-
-
-def player_for(audio_source):
- """
- Return a :class:`PyAudioPlayer` that can play data from `audio_source`.
-
- :Parameters:
-
- `audio_source` :
- an `AudioSource` object.
-
- :Returns:
-
- `PyAudioPlayer` that has the same sampling rate, sample width and number of channels
- as `audio_source`.
- """
-
- return PyAudioPlayer(audio_source.get_sampling_rate(),
- audio_source.get_sample_width(),
- audio_source.get_channels())
-
-
+ bytes_1_sec = self.sampling_rate * self.sample_width * self.channels
+ chunk_size = bytes_1_sec // 10
+ # make sure chunk_size is a multiple of sample_width * channels
+ chunk_size -= chunk_size % (self.sample_width * self.channels)
+ nb_chunks, rest = divmod(len(data), chunk_size)
+ if rest > 0:
+ nb_chunks += 1
+ chunk_gen = (
+ data[i : i + chunk_size] for i in range(0, len(data), chunk_size)
+ )
+ return chunk_gen, nb_chunks
+
+
+def player_for(source):
+ """
+    Return an `AudioPlayer` compatible with `source` (i.e., one that has the
+    same sampling rate, sample width and number of channels).
+
+ Parameters
+ ----------
+ source : AudioSource
+        An object that has `sampling_rate`, `sample_width` and `channels`
+        attributes.
+
+ Returns
+ -------
+ player : PyAudioPlayer
+ An audio player that has the same sampling rate, sample width
+ and number of channels as `source`.
+ """
+ return PyAudioPlayer(
+ source.sampling_rate, source.sample_width, source.channels
+ )
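A usage sketch combining `from_file` and `player_for`, assuming PyAudio is available (the file name is hypothetical):

.. code:: python

    from auditok.io import from_file, player_for

    source = from_file("speech.wav")           # hypothetical file
    player = player_for(source)                # same rate/width/channels
    source.open()
    data = source.read(source.sampling_rate)   # one second of audio
    if data is not None:
        player.play(data)
    player.stop()
    source.close()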
+
+
+def get_audio_source(input=None, **kwargs):
+ """
+ Create and return an AudioSource from input.
+
+ Parameters
+ ----------
+ input : str, bytes, "-" or None (default)
+ source to read audio data from. If `str`, it should be a path to a valid
+ audio file. If `bytes`, it is used as raw audio data. If it is "-",
+ raw data will be read from stdin. If None, read audio data from the
+ microphone using PyAudio.
+ kwargs
+ audio parameters used to build the `AudioSource` object. Depending on
+        the nature of `input`, these may be omitted (e.g., when `input` is an
+ audio file in a popular audio format such as wav, ogg, flac, etc.) or
+ include parameters such as `sampling_rate`, `sample_width`, `channels`
+ (or their respective short name versions `sr`, `sw` and `ch`) if `input`
+ is a path to a raw (headerless) audio file, a bytes object for raw audio
+        data or None (to read data from the built-in microphone). See the
+        respective `AudioSource` classes for more information about possible
+        parameters.
+
+ Returns
+ -------
+ source : AudioSource
+ audio source created from input parameters
+ """
+ if input == "-":
+ return StdinAudioSource(*_get_audio_parameters(kwargs))
+ if isinstance(input, bytes):
+ return BufferAudioSource(input, *_get_audio_parameters(kwargs))
+
+ # read data from a file
+ if input is not None:
+ return from_file(filename=input, **kwargs)
+
+ # read data from microphone via pyaudio
+ else:
+ frames_per_buffer = kwargs.get("frames_per_buffer", 1024)
+ input_device_index = kwargs.get("input_device_index")
+ return PyAudioSource(
+ *_get_audio_parameters(kwargs),
+ frames_per_buffer=frames_per_buffer,
+ input_device_index=input_device_index
+ )
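A sketch of the three kinds of input, assuming the short parameter names `sr`, `sw` and `ch` are accepted as described above (the file name is hypothetical):

.. code:: python

    from auditok.io import get_audio_source

    # from an audio file (format guessed from the extension)
    src = get_audio_source("speech.wav")
    # from raw bytes: audio parameters are required
    src = get_audio_source(b"\x00" * 32000, sr=16000, sw=2, ch=1)
    # from the microphone (requires PyAudio)
    src = get_audio_source(None)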
+
+
+def _load_raw(file, sampling_rate, sample_width, channels, large_file=False):
+ """
+ Load a raw audio file with standard Python. If `large_file` is True, return
+ a `RawAudioSource` object that reads data lazily from disk, otherwise load
+ all data to memory and return a `BufferAudioSource` object.
+
+ Parameters
+ ----------
+ file : str
+ path to a raw audio data file.
+ sampling_rate : int
+ sampling rate of audio data.
+ sample_width : int
+ size in bytes of one audio sample.
+ channels : int
+ number of channels of audio data.
+ large_file : bool
+ if True, return a `RawAudioSource` otherwise a `BufferAudioSource`
+ object.
+
+ Returns
+ -------
+ source : RawAudioSource or BufferAudioSource
+ an `AudioSource` that reads data from input file.
+ """
+ if None in (sampling_rate, sample_width, channels):
+ raise AudioParameterError(
+ "All audio parameters are required for raw audio files"
+ )
+
+ if large_file:
+ return RawAudioSource(
+ file,
+ sampling_rate=sampling_rate,
+ sample_width=sample_width,
+ channels=channels,
+ )
+
+ with open(file, "rb") as fp:
+ data = fp.read()
+ return BufferAudioSource(
+ data,
+ sampling_rate=sampling_rate,
+ sample_width=sample_width,
+ channels=channels,
+ )
+
+
+def _load_wave(file, large_file=False):
+ """
+ Load a wave audio file with standard Python. If `large_file` is True, return
+ a `WaveAudioSource` object that reads data lazily from disk, otherwise load
+ all data to memory and return a `BufferAudioSource` object.
+
+ Parameters
+ ----------
+ file : str
+ path to a wav audio data file
+ large_file : bool
+ if True, return a `WaveAudioSource` otherwise a `BufferAudioSource`
+ object.
+
+ Returns
+ -------
+ source : WaveAudioSource or BufferAudioSource
+ an `AudioSource` that reads data from input file.
+ """
+ if large_file:
+ return WaveAudioSource(file)
+ with wave.open(file) as fp:
+ channels = fp.getnchannels()
+ srate = fp.getframerate()
+ swidth = fp.getsampwidth()
+ data = fp.readframes(-1)
+ return BufferAudioSource(
+ data, sampling_rate=srate, sample_width=swidth, channels=channels
+ )
+
+
+def _load_with_pydub(file, audio_format=None):
+ """
+ Open compressed audio or video file using pydub. If a video file
+ is passed, its audio track(s) are extracted and loaded.
+
+ Parameters
+ ----------
+ file : str
+ path to audio file.
+ audio_format : str, default: None
+ string, audio/video file format if known (e.g. raw, webm, wav, ogg)
+
+ Returns
+ -------
+ source : BufferAudioSource
+ an `AudioSource` that reads data from input file.
+ """
+ func_dict = {
+ "mp3": AudioSegment.from_mp3,
+ "ogg": AudioSegment.from_ogg,
+ "flv": AudioSegment.from_flv,
+ }
+ open_function = func_dict.get(audio_format, AudioSegment.from_file)
+ segment = open_function(file)
+ return BufferAudioSource(
+ data=segment.raw_data,
+ sampling_rate=segment.frame_rate,
+ sample_width=segment.sample_width,
+ channels=segment.channels,
+ )
+
+
+def from_file(filename, audio_format=None, large_file=False, **kwargs):
+ """
+ Read audio data from `filename` and return an `AudioSource` object.
+    If `audio_format` is None, the appropriate `AudioSource` class is guessed
+ from file's extension. `filename` can be a compressed audio or video file.
+ This will require installing `pydub` (https://github.com/jiaaro/pydub).
+
+ The normal behavior is to load all audio data to memory from which a
+ :class:`BufferAudioSource` object is created. This should be convenient
+    most of the time unless the audio file is very large. In that case, and
+    in order to load audio data in a lazy manner (i.e., read data from disk
+    each time :func:`AudioSource.read` is called), `large_file` should be True.
+
+ Note that the current implementation supports only wave and raw formats for
+ lazy audio loading.
+
+    If the audio format is `raw`, the following keyword arguments are required:
+
+ - `sampling_rate`, `sr`: int, sampling rate of audio data.
+ - `sample_width`, `sw`: int, size in bytes of one audio sample.
+ - `channels`, `ch`: int, number of channels of audio data.
+
+ See also
+ --------
+ :func:`to_file`.
+
+ Parameters
+ ----------
+ filename : str
+ path to input audio or video file.
+ audio_format : str
+        audio format of the input file (e.g. raw, webm, wav, ogg).
+ large_file : bool, default: False
+        if True, audio data won't be fully loaded to memory; instead, each
+        window is read from disk when requested.
+
+
+ Other Parameters
+ ----------------
+ sampling_rate, sr: int
+ sampling rate of audio data
+ sample_width : int
+ sample width (i.e. number of bytes used to represent one audio sample)
+ channels : int
+ number of channels of audio data
+
+ Returns
+ -------
+ audio_source : AudioSource
+ an :class:`AudioSource` object that reads data from input file.
+
+ Raises
+ ------
+ `AudioIOError`
+        raised if audio data cannot be read in the given format or if
+        `audio_format` is `raw` and one or more audio parameters are missing.
+ """
+ audio_format = _guess_audio_format(audio_format, filename)
+
+ if audio_format == "raw":
+ srate, swidth, channels = _get_audio_parameters(kwargs)
+ return _load_raw(filename, srate, swidth, channels, large_file)
+
+ if audio_format in ["wav", "wave"]:
+ return _load_wave(filename, large_file)
+ if large_file:
+ err_msg = "if 'large_file` is True file format should be raw or wav"
+ raise AudioIOError(err_msg)
+ if _WITH_PYDUB:
+ return _load_with_pydub(filename, audio_format=audio_format)
+ else:
+ raise AudioIOError(
+ "pydub is required for audio formats other than raw or wav"
+ )
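A hedged sketch of the two loading modes for a headerless file (the file name is hypothetical):

.. code:: python

    from auditok.io import from_file

    # raw files need explicit audio parameters
    src = from_file(
        "capture.raw", sampling_rate=16000, sample_width=2, channels=1
    )
    # very large file: read lazily from disk instead of loading it all
    src = from_file(
        "capture.raw",
        large_file=True,
        sampling_rate=16000,
        sample_width=2,
        channels=1,
    )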
+
+
+def _save_raw(data, file):
+ """
+ Saves audio data as a headerless (i.e. raw) file.
+ See also :func:`to_file`.
+ """
+ with open(file, "wb") as fp:
+ fp.write(data)
+
+
+def _save_wave(data, file, sampling_rate, sample_width, channels):
+ """
+ Saves audio data to a wave file.
+ See also :func:`to_file`.
+ """
+ if None in (sampling_rate, sample_width, channels):
+ raise AudioParameterError(
+ "All audio parameters are required to save wave audio files"
+ )
+ with wave.open(file, "w") as fp:
+ fp.setframerate(sampling_rate)
+ fp.setsampwidth(sample_width)
+ fp.setnchannels(channels)
+ fp.writeframes(data)
+
+
+def _save_with_pydub(
+ data, file, audio_format, sampling_rate, sample_width, channels
+):
+ """
+ Saves audio data with pydub (https://github.com/jiaaro/pydub).
+ See also :func:`to_file`.
+ """
+ segment = AudioSegment(
+ data,
+ frame_rate=sampling_rate,
+ sample_width=sample_width,
+ channels=channels,
+ )
+ with open(file, "wb") as fp:
+ segment.export(fp, format=audio_format)
+
+
+def to_file(data, file, audio_format=None, **kwargs):
+ """
+ Writes audio data to file. If `audio_format` is `None`, output
+ audio format will be guessed from extension. If `audio_format`
+ is `None` and `file` comes without an extension then audio
+ data will be written as a raw audio file.
+
+ Parameters
+ ----------
+ data : bytes-like
+ audio data to be written. Can be a `bytes`, `bytearray`,
+ `memoryview`, `array` or `numpy.ndarray` object.
+ file : str
+ path to output audio file.
+ audio_format : str
+ audio format used to save data (e.g. raw, webm, wav, ogg)
+ kwargs: dict
+ If an audio format other than `raw` is used, the following keyword
+ arguments are required:
+
+ - `sampling_rate`, `sr`: int, sampling rate of audio data.
+ - `sample_width`, `sw`: int, size in bytes of one audio sample.
+ - `channels`, `ch`: int, number of channels of audio data.
+
+ Raises
+ ------
+    `AudioParameterError`
+        raised if output format is different from raw and one or more audio
+        parameters are missing.
+    `AudioIOError`
+        raised if audio data cannot be written in the desired format.
+ """
+ audio_format = _guess_audio_format(audio_format, file)
+ if audio_format in (None, "raw"):
+ _save_raw(data, file)
+ return
+ try:
+ sampling_rate, sample_width, channels = _get_audio_parameters(kwargs)
+ except AudioParameterError as exc:
+ err_message = "All audio parameters are required to save formats "
+ "other than raw. Error detail: {}".format(exc)
+ raise AudioParameterError(err_message)
+ if audio_format in ("wav", "wave"):
+ _save_wave(data, file, sampling_rate, sample_width, channels)
+ elif _WITH_PYDUB:
+ _save_with_pydub(
+ data, file, audio_format, sampling_rate, sample_width, channels
+ )
+ else:
+ err_message = "cannot write file format {} (file name: {})"
+ raise AudioIOError(err_message.format(audio_format, file))
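A short sketch of both output paths, using one second of silence as stand-in data:

.. code:: python

    from auditok.io import to_file

    data = b"\x00" * 32000  # 1 s of 16 kHz, 16-bit mono silence
    # raw output: no audio parameters needed
    to_file(data, "output.raw")
    # wave output: all three parameters are required
    to_file(
        data, "output.wav", sampling_rate=16000, sample_width=2, channels=1
    )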
diff --git a/libs/auditok/plotting.py b/libs/auditok/plotting.py
new file mode 100755
index 000000000..eca5877f4
--- /dev/null
+++ b/libs/auditok/plotting.py
@@ -0,0 +1,150 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+AUDITOK_PLOT_THEME = {
+ "figure": {"facecolor": "#482a36", "alpha": 0.2},
+ "plot": {"facecolor": "#282a36"},
+ "energy_threshold": {
+ "color": "#e31f8f",
+ "linestyle": "--",
+ "linewidth": 1,
+ },
+ "signal": {"color": "#40d970", "linestyle": "-", "linewidth": 1},
+ "detections": {
+ "facecolor": "#777777",
+ "edgecolor": "#ff8c1a",
+ "linewidth": 1,
+ "alpha": 0.75,
+ },
+}
+
+
+def _make_time_axis(nb_samples, sampling_rate):
+ sample_duration = 1 / sampling_rate
+ x = np.linspace(0, sample_duration * (nb_samples - 1), nb_samples)
+ return x
+
+
+def _plot_line(x, y, theme, xlabel=None, ylabel=None, **kwargs):
+ color = theme.get("color", theme.get("c"))
+ ls = theme.get("linestyle", theme.get("ls"))
+ lw = theme.get("linewidth", theme.get("lw"))
+ plt.plot(x, y, c=color, ls=ls, lw=lw, **kwargs)
+ plt.xlabel(xlabel, fontsize=8)
+ plt.ylabel(ylabel, fontsize=8)
+
+
+def _plot_detections(subplot, detections, theme):
+ fc = theme.get("facecolor", theme.get("fc"))
+ ec = theme.get("edgecolor", theme.get("ec"))
+ ls = theme.get("linestyle", theme.get("ls"))
+ lw = theme.get("linewidth", theme.get("lw"))
+ alpha = theme.get("alpha")
+ for (start, end) in detections:
+ subplot.axvspan(start, end, fc=fc, ec=ec, ls=ls, lw=lw, alpha=alpha)
+
+
+def plot(
+ audio_region,
+ scale_signal=True,
+ detections=None,
+ energy_threshold=None,
+ show=True,
+ figsize=None,
+ save_as=None,
+ dpi=120,
+ theme="auditok",
+):
+ y = np.asarray(audio_region)
+ if len(y.shape) == 1:
+ y = y.reshape(1, -1)
+ nb_subplots, nb_samples = y.shape
+ sampling_rate = audio_region.sampling_rate
+ time_axis = _make_time_axis(nb_samples, sampling_rate)
+ if energy_threshold is not None:
+ eth_log10 = energy_threshold * np.log(10) / 10
+ amplitude_threshold = np.sqrt(np.exp(eth_log10))
+ else:
+ amplitude_threshold = None
+ if detections is None:
+ detections = []
+ else:
+        # End of detection corresponds to the end of the last sample, but to
+        # stay compatible with the time axis of signal plotting we want the
+        # end of detection to correspond to the *start* of that last sample.
+ detections = [
+ (start, end - (1 / sampling_rate)) for (start, end) in detections
+ ]
+ if theme == "auditok":
+ theme = AUDITOK_PLOT_THEME
+
+ fig = plt.figure(figsize=figsize, dpi=dpi)
+ fig_theme = theme.get("figure", theme.get("fig", {}))
+ fig_fc = fig_theme.get("facecolor", fig_theme.get("ffc"))
+ fig_alpha = fig_theme.get("alpha", 1)
+ fig.patch.set_facecolor(fig_fc)
+ fig.patch.set_alpha(fig_alpha)
+
+ plot_theme = theme.get("plot", {})
+ plot_fc = plot_theme.get("facecolor", plot_theme.get("pfc"))
+
+ if nb_subplots > 2 and nb_subplots % 2 == 0:
+ nb_rows = nb_subplots // 2
+ nb_columns = 2
+ else:
+ nb_rows = nb_subplots
+ nb_columns = 1
+
+ for sid, samples in enumerate(y, 1):
+ ax = fig.add_subplot(nb_rows, nb_columns, sid)
+ ax.set_facecolor(plot_fc)
+ if scale_signal:
+            std = samples.std()
+            if std > 0:
+                mean = samples.mean()
+                samples = (samples - mean) / std
+ max_ = samples.max()
+ plt.ylim(-1.5 * max_, 1.5 * max_)
+ if amplitude_threshold is not None:
+ if scale_signal and std > 0:
+ amp_th = (amplitude_threshold - mean) / std
+ else:
+ amp_th = amplitude_threshold
+ eth_theme = theme.get("energy_threshold", theme.get("eth", {}))
+ _plot_line(
+ [time_axis[0], time_axis[-1]],
+ [amp_th] * 2,
+ eth_theme,
+ label="Detection threshold",
+ )
+ if sid == 1:
+ legend = plt.legend(
+ ["Detection threshold"],
+ facecolor=fig_fc,
+ framealpha=0.1,
+ bbox_to_anchor=(0.0, 1.15, 1.0, 0.102),
+ loc=2,
+ )
+ legend = plt.gca().add_artist(legend)
+
+ signal_theme = theme.get("signal", {})
+ _plot_line(
+ time_axis,
+ samples,
+ signal_theme,
+ xlabel="Time (seconds)",
+ ylabel="Signal{}".format(" (scaled)" if scale_signal else ""),
+ )
+ detections_theme = theme.get("detections", {})
+ _plot_detections(ax, detections, detections_theme)
+ plt.title("Channel {}".format(sid), fontsize=10)
+
+ plt.xticks(fontsize=8)
+ plt.yticks(fontsize=8)
+ plt.tight_layout()
+
+ if save_as is not None:
+ plt.savefig(save_as, dpi=dpi)
+ if show:
+ plt.show()
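A usage sketch for this new module, assuming `AudioRegion.load` as the loading entry point (the file name and detection intervals are hypothetical):

.. code:: python

    from auditok import AudioRegion
    from auditok.plotting import plot

    region = AudioRegion.load("speech.wav")  # hypothetical file
    plot(
        region,
        energy_threshold=50,                  # drawn as a horizontal line
        detections=[(0.5, 1.2), (2.0, 2.8)],  # (start, end) pairs in seconds
        show=True,
    )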
diff --git a/libs/auditok/signal.py b/libs/auditok/signal.py
new file mode 100644
index 000000000..3f00fb9e5
--- /dev/null
+++ b/libs/auditok/signal.py
@@ -0,0 +1,179 @@
+"""
+Module for basic audio signal processing and array operations.
+
+.. autosummary::
+ :toctree: generated/
+
+ to_array
+ extract_single_channel
+ compute_average_channel
+ compute_average_channel_stereo
+ separate_channels
+ calculate_energy_single_channel
+ calculate_energy_multichannel
+"""
+from array import array as array_
+import audioop
+import math
+
+FORMAT = {1: "b", 2: "h", 4: "i"}
+_EPSILON = 1e-10
+
+
+def to_array(data, sample_width, channels):
+ """Extract individual channels of audio data and return a list of arrays of
+ numeric samples. This will always return a list of `array.array` objects
+ (one per channel) even if audio data is mono.
+
+ Parameters
+ ----------
+ data : bytes
+ raw audio data.
+    sample_width : int
+        size in bytes of one audio sample (one channel considered).
+    channels : int
+        number of channels of audio data.
+
+ Returns
+ -------
+ samples_arrays : list
+ list of arrays of audio samples.
+ """
+ fmt = FORMAT[sample_width]
+ if channels == 1:
+ return [array_(fmt, data)]
+ return separate_channels(data, fmt, channels)
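A small illustration of `to_array` on interleaved stereo data:

.. code:: python

    from array import array
    from auditok.signal import to_array

    # interleaved 16-bit stereo samples: L0 R0 L1 R1
    data = array("h", [100, 200, 300, 400]).tobytes()
    channels = to_array(data, sample_width=2, channels=2)
    # -> [array('h', [100, 300]), array('h', [200, 400])]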
+
+
+def extract_single_channel(data, fmt, channels, selected):
+ samples = array_(fmt, data)
+ return samples[selected::channels]
+
+
+def compute_average_channel(data, fmt, channels):
+ """
+ Compute and return average channel of multi-channel audio data. If the
+ number of channels is 2, use :func:`compute_average_channel_stereo` (much
+    faster). This function uses the standard `array` module to convert `bytes`
+    data into an array of numeric values.
+
+ Parameters
+ ----------
+ data : bytes
+ multi-channel audio data to mix down.
+ fmt : str
+ format (single character) to pass to `array.array` to convert `data`
+ into an array of samples. This should be "b" if audio data's sample width
+ is 1, "h" if it's 2 and "i" if it's 4.
+ channels : int
+ number of channels of audio data.
+
+ Returns
+ -------
+ mono_audio : bytes
+ mixed down audio data.
+ """
+ all_channels = array_(fmt, data)
+ mono_channels = [
+ array_(fmt, all_channels[ch::channels]) for ch in range(channels)
+ ]
+ avg_arr = array_(
+ fmt,
+ (round(sum(samples) / channels) for samples in zip(*mono_channels)),
+ )
+ return avg_arr
+
+
+def compute_average_channel_stereo(data, sample_width):
+ """Compute and return average channel of stereo audio data. This function
+ should be used when the number of channels is exactly 2 because in that
+    case we can use the standard `audioop` module, which is *much* faster than
+    calling :func:`compute_average_channel`.
+
+ Parameters
+ ----------
+ data : bytes
+ 2-channel audio data to mix down.
+ sample_width : int
+ size in bytes of one audio sample (one channel considered).
+
+ Returns
+ -------
+ mono_audio : bytes
+ mixed down audio data.
+ """
+ fmt = FORMAT[sample_width]
+ arr = array_(fmt, audioop.tomono(data, sample_width, 0.5, 0.5))
+ return arr
+
+
+def separate_channels(data, fmt, channels):
+ """Create a list of arrays of audio samples (`array.array` objects), one for
+ each channel.
+
+ Parameters
+ ----------
+    data : bytes
+        multi-channel audio data to separate.
+ fmt : str
+ format (single character) to pass to `array.array` to convert `data`
+ into an array of samples. This should be "b" if audio data's sample width
+ is 1, "h" if it's 2 and "i" if it's 4.
+ channels : int
+ number of channels of audio data.
+
+ Returns
+ -------
+ channels_arr : list
+ list of audio channels, each as a standard `array.array`.
+ """
+ all_channels = array_(fmt, data)
+ mono_channels = [
+ array_(fmt, all_channels[ch::channels]) for ch in range(channels)
+ ]
+ return mono_channels
+
+
+def calculate_energy_single_channel(data, sample_width):
+ """Calculate the energy of mono audio data. Energy is computed as:
+
+    .. math:: energy = 20 \log_{10}\left(\sqrt{\frac{1}{N}\sum_{i=1}^{N}{a_i}^2}\right) % # noqa: W605
+
+ where `a_i` is the i-th audio sample and `N` is the number of audio samples
+ in data.
+
+ Parameters
+ ----------
+ data : bytes
+ single-channel audio data.
+ sample_width : int
+ size in bytes of one audio sample.
+
+ Returns
+ -------
+ energy : float
+ energy of audio signal.
+ """
+ energy_sqrt = max(audioop.rms(data, sample_width), _EPSILON)
+ return 20 * math.log10(energy_sqrt)
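A quick numeric check of this formula: a constant signal of amplitude 1000 has an RMS of exactly 1000, hence an energy of 20 * log10(1000) = 60 dB, while all-zero data is clamped to `_EPSILON` instead of yielding -inf:

.. code:: python

    from array import array
    from auditok.signal import calculate_energy_single_channel

    samples = array("h", [1000] * 160)  # 10 ms of audio at 16 kHz
    energy = calculate_energy_single_channel(samples.tobytes(), 2)
    assert round(energy, 3) == 60.0
    # silence: RMS is clamped to 1e-10, giving 20 * log10(1e-10) = -200.0
    calculate_energy_single_channel(b"\x00" * 320, 2)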
+
+
+def calculate_energy_multichannel(x, sample_width, aggregation_fn=max):
+ """Calculate the energy of multi-channel audio data. Energy is calculated
+ channel-wise. An aggregation function is applied to the resulting energies
+ (default: `max`). Also see :func:`calculate_energy_single_channel`.
+
+ Parameters
+ ----------
+ data : bytes
+ single-channel audio data.
+ sample_width : int
+ size in bytes of one audio sample (one channel considered).
+ aggregation_fn : callable, default: max
+ aggregation function to apply to the resulting per-channel energies.
+
+ Returns
+ -------
+ energy : float
+ aggregated energy of multi-channel audio signal.
+ """
+ energies = (calculate_energy_single_channel(xi, sample_width) for xi in x)
+ return aggregation_fn(energies)
diff --git a/libs/auditok/signal_numpy.py b/libs/auditok/signal_numpy.py
new file mode 100644
index 000000000..bf5425197
--- /dev/null
+++ b/libs/auditok/signal_numpy.py
@@ -0,0 +1,30 @@
+import numpy as np
+from .signal import (
+ compute_average_channel_stereo,
+ calculate_energy_single_channel,
+ calculate_energy_multichannel,
+)
+
+FORMAT = {1: np.int8, 2: np.int16, 4: np.int32}
+
+
+def to_array(data, sample_width, channels):
+ fmt = FORMAT[sample_width]
+ if channels == 1:
+ return np.frombuffer(data, dtype=fmt).astype(np.float64)
+ return separate_channels(data, fmt, channels).astype(np.float64)
+
+
+def extract_single_channel(data, fmt, channels, selected):
+ samples = np.frombuffer(data, dtype=fmt)
+ return np.asanyarray(samples[selected::channels], order="C")
+
+
+def compute_average_channel(data, fmt, channels):
+ array = np.frombuffer(data, dtype=fmt).astype(np.float64)
+ return array.reshape(-1, channels).mean(axis=1).round().astype(fmt)
+
+
+def separate_channels(data, fmt, channels):
+ array = np.frombuffer(data, dtype=fmt)
+ return np.asanyarray(array.reshape(-1, channels).T, order="C")
diff --git a/libs/auditok/util.py b/libs/auditok/util.py
index d46a8899c..f29eb9bf3 100644
--- a/libs/auditok/util.py
+++ b/libs/auditok/util.py
@@ -1,448 +1,624 @@
"""
-Class summary
-=============
-
.. autosummary::
+ :toctree: generated/
- DataSource
- StringDataSource
- ADSFactory
- ADSFactory.AudioDataSource
- ADSFactory.ADSDecorator
- ADSFactory.OverlapADS
- ADSFactory.LimiterADS
- ADSFactory.RecorderADS
- DataValidator
- AudioEnergyValidator
-
+ AudioEnergyValidator
+ AudioReader
+ Recorder
+ make_duration_formatter
+ make_channel_selector
"""
+from abc import ABC, abstractmethod
+import warnings
+from functools import partial
+from .io import (
+ AudioIOError,
+ AudioSource,
+ from_file,
+ BufferAudioSource,
+ PyAudioSource,
+ get_audio_source,
+)
+from .exceptions import (
+ DuplicateArgument,
+ TooSamllBlockDuration,
+ TimeFormatError,
+)
+try:
+ from . import signal_numpy as signal
+except ImportError:
+ from . import signal
-from abc import ABCMeta, abstractmethod
-import math
-from array import array
-from .io import Rewindable, from_file, BufferAudioSource, PyAudioSource
-from .exceptions import DuplicateArgument
-import sys
+__all__ = [
+ "make_duration_formatter",
+ "make_channel_selector",
+ "DataSource",
+ "DataValidator",
+ "StringDataSource",
+ "ADSFactory",
+ "AudioDataSource",
+ "AudioReader",
+ "Recorder",
+ "AudioEnergyValidator",
+]
-try:
- import numpy
- _WITH_NUMPY = True
-except ImportError as e:
- _WITH_NUMPY = False
-
-try:
- from builtins import str
- basestring = str
-except ImportError as e:
- if sys.version_info >= (3, 0):
- basestring = str
-
-
-
-__all__ = ["DataSource", "DataValidator", "StringDataSource", "ADSFactory", "AudioEnergyValidator"]
-
-
-class DataSource():
+
+def make_duration_formatter(fmt):
+ """
+ Make and return a function used to format durations in seconds. Accepted
+ format directives are:
+
+    - ``%S`` : absolute number of seconds with 3 decimals. This directive
+      should be used alone.
+    - ``%I`` : absolute number of milliseconds. This directive should be used
+      alone.
+ - ``%i`` : milliseconds
+ - ``%s`` : seconds
+ - ``%m`` : minutes
+ - ``%h`` : hours
+
+ These last 4 directives should all be specified. They can be placed anywhere
+ in the input string.
+
+ Parameters
+ ----------
+ fmt : str
+ duration format.
+
+ Returns
+ -------
+ formatter : callable
+ a function that takes a duration in seconds (float) and returns a string
+ that corresponds to that duration.
+
+ Raises
+ ------
+ TimeFormatError
+ if the format contains an unknown directive.
+
+ Examples
+ --------
+
+ Using ``%S``:
+
+ .. code:: python
+
+ formatter = make_duration_formatter("%S")
+ formatter(123.589)
+ '123.589'
+ formatter(123)
+ '123.000'
+
+ Using the other directives:
+
+ .. code:: python
+
+ formatter = make_duration_formatter("%h:%m:%s.%i")
+ formatter(3600+120+3.25)
+ '01:02:03.250'
+
+ formatter = make_duration_formatter("%h hrs, %m min, %s sec and %i ms")
+ formatter(3600+120+3.25)
+ '01 hrs, 02 min, 03 sec and 250 ms'
+
+ # omitting one of the 4 directives might result in a wrong duration
+ formatter = make_duration_formatter("%m min, %s sec and %i ms")
+ formatter(3600+120+3.25)
+ '02 min, 03 sec and 250 ms'
+ """
+ if fmt == "%S":
+
+        def formatter(seconds):
+ return "{:.3f}".format(seconds)
+
+ elif fmt == "%I":
+
+        def formatter(seconds):
+ return "{0}".format(int(seconds * 1000))
+
+ else:
+ fmt = fmt.replace("%h", "{hrs:02d}")
+ fmt = fmt.replace("%m", "{mins:02d}")
+ fmt = fmt.replace("%s", "{secs:02d}")
+ fmt = fmt.replace("%i", "{millis:03d}")
+ try:
+ i = fmt.index("%")
+ raise TimeFormatError(
+ "Unknown time format directive '{0}'".format(fmt[i : i + 2])
+ )
+ except ValueError:
+ pass
+
+        def formatter(seconds):
+ millis = int(seconds * 1000)
+ hrs, millis = divmod(millis, 3600000)
+ mins, millis = divmod(millis, 60000)
+ secs, millis = divmod(millis, 1000)
+ return fmt.format(hrs=hrs, mins=mins, secs=secs, millis=millis)
+
+    return formatter
+
+
+def make_channel_selector(sample_width, channels, selected=None):
+ """Create and return a callable used for audio channel selection. The
+    returned selector can be used as `selector(audio_data)` and returns data
+    that contains the selected channel only.
+
+ Importantly, if `selected` is None or equals "any", `selector(audio_data)`
+ will separate and return a list of available channels:
+    `[data_channel_1, data_channel_2, ...]`.
+
+    Note also that the returned `selector` expects `bytes` format for input
+    data but does not necessarily return a `bytes` object. In fact, in order
+    to extract the desired channel (or compute the average channel if
+    `selected` = "avg"), it first converts input data into an `array.array`
+    (or `numpy.ndarray`) object. After the channel of interest is
+    selected/computed, it is returned as such, without any reconversion to
+    `bytes`. This behavior is intentional, for efficiency, as returned objects
+    can be directly used as buffers of bytes. In any case, returned objects
+    can be converted back to `bytes` using `bytes(obj)`.
+
+    The exception to this is the special case where `channels` = 1, in which
+    case input data is returned without any processing.
+
+
+ Parameters
+ ----------
+ sample_width : int
+ number of bytes used to encode one audio sample, should be 1, 2 or 4.
+ channels : int
+ number of channels of raw audio data that the returned selector should
+ expect.
+ selected : int or str, default: None
+ audio channel to select and return when calling `selector(raw_data)`. It
+ should be an int >= `-channels` and < `channels`. If one of "mix",
+ "avg" or "average" is passed then `selector` will return the average
+ channel of audio data. If None or "any", return a list of all available
+ channels at each call.
+
+ Returns
+ -------
+ selector : callable
+        a callable that can be used as `selector(audio_data)` and returns data
+        that contains the channel of interest.
+
+ Raises
+ ------
+ ValueError
+ if `sample_width` is not one of 1, 2 or 4, or if `selected` has an
+ unexpected value.
"""
- Base class for objects passed to :func:`auditok.core.StreamTokenizer.tokenize`.
+ fmt = signal.FORMAT.get(sample_width)
+ if fmt is None:
+ err_msg = "'sample_width' must be 1, 2 or 4, given: {}"
+ raise ValueError(err_msg.format(sample_width))
+ if channels == 1:
+ return lambda x: x
+
+ if isinstance(selected, int):
+ if selected < 0:
+ selected += channels
+ if selected < 0 or selected >= channels:
+ err_msg = "Selected channel must be >= -channels and < channels"
+ err_msg += ", given: {}"
+ raise ValueError(err_msg.format(selected))
+ return partial(
+ signal.extract_single_channel,
+ fmt=fmt,
+ channels=channels,
+ selected=selected,
+ )
+
+ if selected in ("mix", "avg", "average"):
+ if channels == 2:
+ # when data is stereo, using audioop when possible is much faster
+ return partial(
+ signal.compute_average_channel_stereo,
+ sample_width=sample_width,
+ )
+
+ return partial(
+ signal.compute_average_channel, fmt=fmt, channels=channels
+ )
+
+ if selected in (None, "any"):
+ return partial(signal.separate_channels, fmt=fmt, channels=channels)
+
+ raise ValueError(
+ "Selected channel must be an integer, None (alias 'any') or 'average' "
+ "(alias 'avg' or 'mix')"
+ )
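A small sketch of the selector in action on interleaved 16-bit stereo data:

.. code:: python

    from array import array
    from auditok.util import make_channel_selector

    # interleaved 16-bit stereo samples: L0 R0 L1 R1
    data = array("h", [2, 4, 6, 8]).tobytes()

    first = make_channel_selector(sample_width=2, channels=2, selected=0)
    list(first(data))  # -> [2, 6]

    avg = make_channel_selector(sample_width=2, channels=2, selected="avg")
    list(avg(data))    # -> [3, 7], the averages of (2, 4) and (6, 8)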
+
+
+class DataSource(ABC):
+ """
+ Base class for objects passed to :func:`StreamTokenizer.tokenize`.
Subclasses should implement a :func:`DataSource.read` method.
"""
- __metaclass__ = ABCMeta
-
+
@abstractmethod
def read(self):
"""
- Read a piece of data read from this source.
+        Read a block (i.e., window) of data from this source.
If no more data is available, return None.
"""
-
-
-class DataValidator():
+
+
+class DataValidator(ABC):
"""
- Base class for a validator object used by :class:`.core.StreamTokenizer` to check
- if read data is valid.
+ Base class for a validator object used by :class:`.core.StreamTokenizer`
+ to check if read data is valid.
Subclasses should implement :func:`is_valid` method.
"""
- __metaclass__ = ABCMeta
-
+
@abstractmethod
def is_valid(self, data):
"""
Check whether `data` is valid
"""
+
+class AudioEnergyValidator(DataValidator):
+ """
+ A validator based on audio signal energy. For an input window of `N` audio
+ samples (see :func:`AudioEnergyValidator.is_valid`), the energy is computed
+ as:
+
+    .. math:: energy = 20 \log_{10}\left(\sqrt{\frac{1}{N}\sum_{i=1}^{N}{a_i}^2}\right) % # noqa: W605
+
+ where `a_i` is the i-th audio sample.
+
+ Parameters
+ ----------
+ energy_threshold : float
+ minimum energy that audio window should have to be valid.
+ sample_width : int
+ size in bytes of one audio sample.
+ channels : int
+ number of channels of audio data.
+ use_channel : {None, "any", "mix", "avg", "average"} or int
+ channel to use for energy computation. The following values are
+ accepted:
+
+ - None (alias "any") : compute energy for each of the channels and return
+ the maximum value.
+ - "mix" (alias "avg" or "average") : compute the average channel then
+ compute its energy.
+    - int (>= 0, < `channels`) : compute the energy of the specified channel
+        and ignore the other ones.
+ """
+
+ def __init__(
+ self, energy_threshold, sample_width, channels, use_channel=None
+ ):
+ self._sample_width = sample_width
+ self._selector = make_channel_selector(
+ sample_width, channels, use_channel
+ )
+ if channels == 1 or use_channel not in (None, "any"):
+ self._energy_fn = signal.calculate_energy_single_channel
+ else:
+ self._energy_fn = signal.calculate_energy_multichannel
+ self._energy_threshold = energy_threshold
+
+ def is_valid(self, data):
+ """
+
+ Parameters
+ ----------
+ data : bytes-like
+ array of raw audio data
+
+ Returns
+ -------
+ bool
+ True if the energy of audio data is >= threshold, False otherwise.
+ """
+ log_energy = self._energy_fn(self._selector(data), self._sample_width)
+ return log_energy >= self._energy_threshold
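A usage sketch tying the validator to the energy values computed above (a constant amplitude of 1000 gives 60 dB, an amplitude of 10 gives 20 dB):

.. code:: python

    from array import array
    from auditok.util import AudioEnergyValidator

    validator = AudioEnergyValidator(
        energy_threshold=50, sample_width=2, channels=1
    )
    loud = array("h", [1000] * 160).tobytes()  # energy = 60 dB
    quiet = array("h", [10] * 160).tobytes()   # energy = 20 dB
    assert validator.is_valid(loud)
    assert not validator.is_valid(quiet)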
+
+
class StringDataSource(DataSource):
"""
- A class that represent a :class:`DataSource` as a string buffer.
- Each call to :func:`DataSource.read` returns on character and moves one step forward.
- If the end of the buffer is reached, :func:`read` returns None.
-
- :Parameters:
-
- `data` :
- a basestring object.
-
+    Class that represents a :class:`DataSource` as a string buffer.
+    Each call to :func:`DataSource.read` returns one character and moves one
+ step forward. If the end of the buffer is reached, :func:`read` returns
+ None.
+
+ Parameters
+ ----------
+ data : str
+ a string object used as data.
+
"""
-
+
def __init__(self, data):
self._data = None
self._current = 0
self.set_data(data)
-
-
+
def read(self):
"""
Read one character from buffer.
-
- :Returns:
-
- Current character or None if end of buffer is reached
+
+ Returns
+ -------
+ char : str
+ current character or None if end of buffer is reached.
"""
-
+
if self._current >= len(self._data):
return None
self._current += 1
return self._data[self._current - 1]
-
+
def set_data(self, data):
"""
Set a new data buffer.
-
- :Parameters:
-
- `data` : a basestring object
- New data buffer.
+
+ Parameters
+ ----------
+ data : str
+ new data buffer.
"""
-
- if not isinstance(data, basestring):
- raise ValueError("data must an instance of basestring")
+
+ if not isinstance(data, str):
+ raise ValueError("data must an instance of str")
self._data = data
self._current = 0
-
class ADSFactory:
"""
- Factory class that makes it easy to create an :class:`ADSFactory.AudioDataSource` object that implements
- :class:`DataSource` and can therefore be passed to :func:`auditok.core.StreamTokenizer.tokenize`.
-
- Whether you read audio data from a file, the microphone or a memory buffer, this factory
- instantiates and returns the right :class:`ADSFactory.AudioDataSource` object.
-
- There are many other features you want your :class:`ADSFactory.AudioDataSource` object to have, such as:
- memorize all read audio data so that you can rewind and reuse it (especially useful when
- reading data from the microphone), read a fixed amount of data (also useful when reading
- from the microphone), read overlapping audio frames (often needed when dosing a spectral
- analysis of data).
-
- :func:`ADSFactory.ads` automatically creates and return object with the desired behavior according
- to the supplied keyword arguments.
-
+ .. deprecated:: 2.0.0
+ `ADSFactory` will be removed in auditok 2.0.1, use instances of
+ :class:`AudioReader` instead.
+
+ Factory class that makes it easy to create an
+ :class:`AudioDataSource` object that implements
+ :class:`DataSource` and can therefore be passed to
+ :func:`auditok.core.StreamTokenizer.tokenize`.
+
+ Whether you read audio data from a file, the microphone or a memory buffer,
+ this factory instantiates and returns the right
+ :class:`AudioDataSource` object.
+
+    There are many other features you may want an :class:`AudioDataSource`
+    object to have, such as: memorize all read audio data so that you can
+    rewind and reuse it (especially useful when reading data from the
+    microphone), read a fixed amount of data (also useful when reading from
+    the microphone), or read overlapping audio frames (often needed when doing
+    a spectral analysis of data).
+
+    :func:`ADSFactory.ads` automatically creates and returns an object with
+    the desired behavior according to the supplied keyword arguments.
"""
-
- @staticmethod
+
+ @staticmethod # noqa: C901
def _check_normalize_args(kwargs):
-
+
for k in kwargs:
- if not k in ["block_dur", "hop_dur", "block_size", "hop_size", "max_time", "record",
- "audio_source", "filename", "data_buffer", "frames_per_buffer", "sampling_rate",
- "sample_width", "channels", "sr", "sw", "ch", "asrc", "fn", "fpb", "db", "mt",
- "rec", "bd", "hd", "bs", "hs"]:
+ if k not in [
+ "block_dur",
+ "hop_dur",
+ "block_size",
+ "hop_size",
+ "max_time",
+ "record",
+ "audio_source",
+ "filename",
+ "data_buffer",
+ "frames_per_buffer",
+ "sampling_rate",
+ "sample_width",
+ "channels",
+ "sr",
+ "sw",
+ "ch",
+ "asrc",
+ "fn",
+ "fpb",
+ "db",
+ "mt",
+ "rec",
+ "bd",
+ "hd",
+ "bs",
+ "hs",
+ ]:
raise ValueError("Invalid argument: {0}".format(k))
-
+
if "block_dur" in kwargs and "bd" in kwargs:
- raise DuplicateArgument("Either 'block_dur' or 'bd' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'block_dur' or 'bd' must be specified, not both"
+ )
+
if "hop_dur" in kwargs and "hd" in kwargs:
- raise DuplicateArgument("Either 'hop_dur' or 'hd' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'hop_dur' or 'hd' must be specified, not both"
+ )
+
if "block_size" in kwargs and "bs" in kwargs:
- raise DuplicateArgument("Either 'block_size' or 'bs' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'block_size' or 'bs' must be specified, not both"
+ )
+
if "hop_size" in kwargs and "hs" in kwargs:
- raise DuplicateArgument("Either 'hop_size' or 'hs' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'hop_size' or 'hs' must be specified, not both"
+ )
+
if "max_time" in kwargs and "mt" in kwargs:
- raise DuplicateArgument("Either 'max_time' or 'mt' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'max_time' or 'mt' must be specified, not both"
+ )
+
if "audio_source" in kwargs and "asrc" in kwargs:
- raise DuplicateArgument("Either 'audio_source' or 'asrc' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'audio_source' or 'asrc' must be specified, not both"
+ )
+
if "filename" in kwargs and "fn" in kwargs:
- raise DuplicateArgument("Either 'filename' or 'fn' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'filename' or 'fn' must be specified, not both"
+ )
+
if "data_buffer" in kwargs and "db" in kwargs:
- raise DuplicateArgument("Either 'filename' or 'db' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'filename' or 'db' must be specified, not both"
+ )
+
if "frames_per_buffer" in kwargs and "fbb" in kwargs:
- raise DuplicateArgument("Either 'frames_per_buffer' or 'fpb' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'frames_per_buffer' or 'fpb' must be specified, not "
+ "both"
+ )
+
if "sampling_rate" in kwargs and "sr" in kwargs:
- raise DuplicateArgument("Either 'sampling_rate' or 'sr' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'sampling_rate' or 'sr' must be specified, not both"
+ )
+
if "sample_width" in kwargs and "sw" in kwargs:
- raise DuplicateArgument("Either 'sample_width' or 'sw' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'sample_width' or 'sw' must be specified, not both"
+ )
+
if "channels" in kwargs and "ch" in kwargs:
- raise DuplicateArgument("Either 'channels' or 'ch' must be specified, not both")
-
+ raise DuplicateArgument(
+ "Either 'channels' or 'ch' must be specified, not both"
+ )
+
if "record" in kwargs and "rec" in kwargs:
- raise DuplicateArgument("Either 'record' or 'rec' must be specified, not both")
-
-
+ raise DuplicateArgument(
+ "Either 'record' or 'rec' must be specified, not both"
+ )
+
kwargs["bd"] = kwargs.pop("block_dur", None) or kwargs.pop("bd", None)
kwargs["hd"] = kwargs.pop("hop_dur", None) or kwargs.pop("hd", None)
kwargs["bs"] = kwargs.pop("block_size", None) or kwargs.pop("bs", None)
kwargs["hs"] = kwargs.pop("hop_size", None) or kwargs.pop("hs", None)
kwargs["mt"] = kwargs.pop("max_time", None) or kwargs.pop("mt", None)
- kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop("asrc", None)
+ kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop(
+ "asrc", None
+ )
kwargs["fn"] = kwargs.pop("filename", None) or kwargs.pop("fn", None)
kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop("db", None)
-
+
record = kwargs.pop("record", False)
if not record:
record = kwargs.pop("rec", False)
if not isinstance(record, bool):
raise TypeError("'record' must be a boolean")
-
+
kwargs["rec"] = record
-
- # keep long names for arguments meant for BufferAudioSource and PyAudioSource
+
+ # keep long names for arguments meant for BufferAudioSource
+ # and PyAudioSource
if "frames_per_buffer" in kwargs or "fpb" in kwargs:
- kwargs["frames_per_buffer"] = kwargs.pop("frames_per_buffer", None) or kwargs.pop("fpb", None)
-
+ kwargs["frames_per_buffer"] = kwargs.pop(
+ "frames_per_buffer", None
+ ) or kwargs.pop("fpb", None)
+
if "sampling_rate" in kwargs or "sr" in kwargs:
- kwargs["sampling_rate"] = kwargs.pop("sampling_rate", None) or kwargs.pop("sr", None)
-
- if "sample_width" in kwargs or "sw" in kwargs:
- kwargs["sample_width"] = kwargs.pop("sample_width", None) or kwargs.pop("sw", None)
-
+ kwargs["sampling_rate"] = kwargs.pop(
+ "sampling_rate", None
+ ) or kwargs.pop("sr", None)
+
+ if "sample_width" in kwargs or "sw" in kwargs:
+ kwargs["sample_width"] = kwargs.pop(
+ "sample_width", None
+ ) or kwargs.pop("sw", None)
+
if "channels" in kwargs or "ch" in kwargs:
- kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop("ch", None)
-
-
-
-
-
-
-
+ kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop(
+ "ch", None
+ )
+
@staticmethod
def ads(**kwargs):
-
"""
- Create an return an :class:`ADSFactory.AudioDataSource`. The type and behavior of the object is the result
- of the supplied parameters.
-
- :Parameters:
-
- *No parameters* :
- read audio data from the available built-in microphone with the default parameters.
- The returned :class:`ADSFactory.AudioDataSource` encapsulate an :class:`io.PyAudioSource` object and hence
- it accepts the next four parameters are passed to use instead of their default values.
-
- `sampling_rate`, `sr` : *(int)*
- number of samples per second. Default = 16000.
-
- `sample_width`, `sw` : *(int)*
- number of bytes per sample (must be in (1, 2, 4)). Default = 2
-
- `channels`, `ch` : *(int)*
- number of audio channels. Default = 1 (only this value is currently accepted)
-
- `frames_per_buffer`, `fpb` : *(int)*
- number of samples of PyAudio buffer. Default = 1024.
-
- `audio_source`, `asrc` : an `AudioSource` object
- read data from this audio source
-
- `filename`, `fn` : *(string)*
- build an `io.AudioSource` object using this file (currently only wave format is supported)
-
- `data_buffer`, `db` : *(string)*
- build an `io.BufferAudioSource` using data in `data_buffer`. If this keyword is used,
- `sampling_rate`, `sample_width` and `channels` are passed to `io.BufferAudioSource`
- constructor and used instead of default values.
-
- `max_time`, `mt` : *(float)*
- maximum time (in seconds) to read. Default behavior: read until there is no more data
- available.
-
- `record`, `rec` : *(bool)*
- save all read data in cache. Provide a navigable object which boasts a `rewind` method.
- Default = False.
-
- `block_dur`, `bd` : *(float)*
- processing block duration in seconds. This represents the quantity of audio data to return
- each time the :func:`read` method is invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling
- rate is 8000 and the sample width is 2 bytes, :func:`read` returns a buffer of 0.025 * 8000 * 2 = 400
- bytes at most. This parameter will be looked for (and used if available) before `block_size`.
- If neither parameter is given, `block_dur` will be set to 0.01 second (i.e. 10 ms)
-
-
- `hop_dur`, `hd` : *(float)*
- quantity of data to skip from current processing window. if `hop_dur` is supplied then there
- will be an overlap of `block_dur` - `hop_dur` between two adjacent blocks. This
- parameter will be looked for (and used if available) before `hop_size`. If neither parameter
- is given, `hop_dur` will be set to `block_dur` which means that there will be no overlap
- between two consecutively read blocks.
-
- `block_size`, `bs` : *(int)*
- number of samples to read each time the `read` method is called. Default: a block size
- that represents a window of 10ms, so for a sampling rate of 16000, the default `block_size`
- is 160 samples, for a rate of 44100, `block_size` = 441 samples, etc.
-
- `hop_size`, `hs` : *(int)*
- determines the number of overlapping samples between two adjacent read windows. For a
- `hop_size` of value *N*, the overlap is `block_size` - *N*. Default : `hop_size` = `block_size`,
- means that there is no overlap.
-
- :Returns:
-
- An AudioDataSource object that has the desired features.
-
- :Exampels:
-
- 1. **Create an AudioDataSource that reads data from the microphone (requires Pyaudio) with default audio parameters:**
-
- .. code:: python
-
- from auditok import ADSFactory
- ads = ADSFactory.ads()
- ads.get_sampling_rate()
- 16000
- ads.get_sample_width()
- 2
- ads.get_channels()
- 1
-
-
- 2. **Create an AudioDataSource that reads data from the microphone with a sampling rate of 48KHz:**
-
- .. code:: python
-
- from auditok import ADSFactory
- ads = ADSFactory.ads(sr=48000)
- ads.get_sampling_rate()
- 48000
-
- 3. **Create an AudioDataSource that reads data from a wave file:**
-
- .. code:: python
-
- import auditok
- from auditok import ADSFactory
- ads = ADSFactory.ads(fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
- ads.get_sampling_rate()
- 44100
- ads.get_sample_width()
- 2
- ads.get_channels()
- 1
-
- 4. **Define size of read blocks as 20 ms**
-
- .. code:: python
-
- import auditok
- from auditok import ADSFactory
- '''
- we know samling rate for previous file is 44100 samples/second
- so 10 ms are equivalent to 441 samples and 20 ms to 882
- '''
- block_size = 882
- ads = ADSFactory.ads(bs = 882, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
- ads.open()
- # read one block
- data = ads.read()
- ads.close()
- len(data)
- 1764
- assert len(data) == ads.get_sample_width() * block_size
-
- 5. **Define block size as a duration (use block_dur or bd):**
-
- .. code:: python
-
- import auditok
- from auditok import ADSFactory
- dur = 0.25 # second
- ads = ADSFactory.ads(bd = dur, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
- '''
- we know samling rate for previous file is 44100 samples/second
- for a block duration of 250 ms, block size should be 0.25 * 44100 = 11025
- '''
- ads.get_block_size()
- 11025
- assert ads.get_block_size() == int(0.25 * 44100)
- ads.open()
- # read one block
- data = ads.read()
- ads.close()
- len(data)
- 22050
- assert len(data) == ads.get_sample_width() * ads.get_block_size()
-
- 6. **Read overlapping blocks (one of hope_size, hs, hop_dur or hd > 0):**
-
- For better readability we'd better use :class:`auditok.io.BufferAudioSource` with a string buffer:
-
- .. code:: python
-
- import auditok
- from auditok import ADSFactory
- '''
- we supply a data buffer instead of a file (keyword 'data_buffer' or 'db')
- sr : sampling rate = 16 samples/sec
- sw : sample width = 1 byte
- ch : channels = 1
- '''
- buffer = "abcdefghijklmnop" # 16 bytes = 1 second of data
- bd = 0.250 # block duration = 250 ms = 4 bytes
- hd = 0.125 # hop duration = 125 ms = 2 bytes
- ads = ADSFactory.ads(db = "abcdefghijklmnop", bd = bd, hd = hd, sr = 16, sw = 1, ch = 1)
- ads.open()
- ads.read()
- 'abcd'
- ads.read()
- 'cdef'
- ads.read()
- 'efgh'
- ads.read()
- 'ghij'
- data = ads.read()
- assert data == 'ijkl'
-
- 7. **Limit amount of read data (use max_time or mt):**
-
- .. code:: python
-
- '''
- We know audio file is larger than 2.25 seconds
- We want to read up to 2.25 seconds of audio data
- '''
- ads = ADSFactory.ads(mt = 2.25, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
- ads.open()
- data = []
- while True:
- d = ads.read()
- if d is None:
- break
- data.append(d)
-
- ads.close()
- data = b''.join(data)
- assert len(data) == int(ads.get_sampling_rate() * 2.25 * ads.get_sample_width() * ads.get_channels())
+ Create and return an :class:`AudioDataSource`. The type and
+ behavior of the returned object depend on the supplied parameters.
+ Called without any parameters, the factory returns an object that
+ reads audio data from the built-in microphone with the default
+ parameters.
+
+ Parameters
+ ----------
+ sampling_rate, sr : int, default: 16000
+ number of audio samples per second of input audio stream.
+ sample_width, sw : int, default: 2
+ number of bytes per sample, must be one of 1, 2 or 4
+ channels, ch : int, default: 1
+ number of audio channels, only a value of 1 is currently accepted.
+ frames_per_buffer, fpb : int, default: 1024
+ number of samples of the PyAudio buffer.
+ audio_source, asrc : `AudioSource`
+ `AudioSource` to read data from
+ filename, fn : str
+ create an `AudioSource` object using this file
+ data_buffer, db : str
+ build an `io.BufferAudioSource` using data in `data_buffer`.
+ If this keyword is used,
+ `sampling_rate`, `sample_width` and `channels` are passed to
+ `io.BufferAudioSource` constructor and used instead of default
+ values.
+ max_time, mt : float
+ maximum time (in seconds) to read. Default behavior: read until
+ there is no more data
+ available.
+ record, rec : bool, default: False
+ whether to cache all read data. If True, the returned object is
+ navigable (it has a `rewind` method).
+ block_dur, bd : float
+ processing block duration in seconds. This represents the quantity
+ of audio data to return each time the :func:`read` method is
+ invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling rate
+ is 8000 and the sample width is 2 bytes, :func:`read` returns a
+ buffer of 0.025 * 8000 * 2 = 400 bytes at most. This parameter will
+ be looked for (and used if available) before `block_size`. If
+ neither parameter is given, `block_dur` will be set to 0.01 second
+ (i.e. 10 ms)
+ hop_dur, hd : float
+ quantity of data to skip from the current processing window. If
+ `hop_dur` is supplied then there will be an overlap of `block_dur`
+ - `hop_dur` between two adjacent blocks. This parameter will be
+ looked for (and used if available) before `hop_size`.
+ If neither parameter is given, `hop_dur` will be set to `block_dur`
+ which means that there will be no overlap between two consecutively
+ read blocks.
+ block_size, bs : int
+ number of samples to read each time the `read` method is called.
+ Default: a block size that represents a window of 10ms, so for a
+ sampling rate of 16000, the default `block_size` is 160 samples,
+ for a rate of 44100, `block_size` = 441 samples, etc.
+ hop_size, hs : int
+ determines the number of overlapping samples between two adjacent
+ read windows. For a `hop_size` of value *N*, the overlap is
+ `block_size` - *N*. Default : `hop_size` = `block_size`, means that
+ there is no overlap.
+
+ Returns
+ -------
+ audio_data_source : AudioDataSource
+ an `AudioDataSource` object built from the input parameters.
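+
+ Examples
+ --------
+ A minimal sketch of this deprecated API (the file path below is a
+ placeholder; prefer :class:`AudioReader` in new code):
+
+ .. code:: python
+
+     from auditok.util import ADSFactory
+
+     # 20 ms blocks from a wav file (hypothetical path)
+     ads = ADSFactory.ads(fn="audio.wav", bd=0.02)
+     ads.open()
+     block = ads.read()  # bytes, or None at end of stream
+     ads.close()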
"""
-
- # copy user's dictionary (shallow copy)
- kwargs = kwargs.copy()
-
+ warnings.warn(
+ "'ADSFactory' is deprecated and will be removed in a future "
+ "release. Please use AudioReader class instead.",
+ DeprecationWarning,
+ )
+
# check and normalize keyword arguments
ADSFactory._check_normalize_args(kwargs)
-
+
block_dur = kwargs.pop("bd")
hop_dur = kwargs.pop("hd")
block_size = kwargs.pop("bs")
@@ -452,431 +628,483 @@ class ADSFactory:
filename = kwargs.pop("fn")
data_buffer = kwargs.pop("db")
record = kwargs.pop("rec")
-
+
# Case 1: an audio source is supplied
if audio_source is not None:
if (filename, data_buffer) != (None, None):
- raise Warning("You should provide one of 'audio_source', 'filename' or 'data_buffer'\
- keyword parameters. 'audio_source' will be used")
-
+ raise Warning(
+ "You should provide one of 'audio_source', 'filename' or \
+ 'data_buffer' keyword parameters. 'audio_source' will be \
+ used"
+ )
+
# Case 2: a file name is supplied
elif filename is not None:
if data_buffer is not None:
- raise Warning("You should provide one of 'filename' or 'data_buffer'\
- keyword parameters. 'filename' will be used")
+ raise Warning(
+ "You should provide one of 'filename' or 'data_buffer'\
+ keyword parameters. 'filename' will be used"
+ )
audio_source = from_file(filename)
-
- # Case 3: a data_buffer is supplied
+
+ # Case 3: a data_buffer is supplied
elif data_buffer is not None:
- audio_source = BufferAudioSource(data_buffer = data_buffer, **kwargs)
-
+ audio_source = BufferAudioSource(data=data_buffer, **kwargs)
+
# Case 4: try to access native audio input
else:
audio_source = PyAudioSource(**kwargs)
-
-
+
if block_dur is not None:
if block_size is not None:
- raise DuplicateArgument("Either 'block_dur' or 'block_size' can be specified, not both")
- else:
- block_size = int(audio_source.get_sampling_rate() * block_dur)
- elif block_size is None:
- # Set default block_size to 10 ms
- block_size = int(audio_source.get_sampling_rate() / 100)
-
- # Instantiate base AudioDataSource
- ads = ADSFactory.AudioDataSource(audio_source=audio_source, block_size=block_size)
-
- # Limit data to be read
- if max_time is not None:
- ads = ADSFactory.LimiterADS(ads=ads, max_time=max_time)
-
- # Record, rewind and reuse data
- if record:
- ads = ADSFactory.RecorderADS(ads=ads)
-
+ raise DuplicateArgument(
+ "Either 'block_dur' or 'block_size' can be specified, not \
+ both"
+ )
+ elif block_size is not None:
+ block_dur = block_size / audio_source.sr
+ else:
+ block_dur = 0.01 # 10 ms
+
# Read overlapping blocks of data
if hop_dur is not None:
if hop_size is not None:
- raise DuplicateArgument("Either 'hop_dur' or 'hop_size' can be specified, not both")
- else:
- hop_size = int(audio_source.get_sampling_rate() * hop_dur)
-
- if hop_size is not None:
- if hop_size <= 0 or hop_size > block_size:
- raise ValueError("hop_size must be > 0 and <= block_size")
- if hop_size < block_size:
- ads = ADSFactory.OverlapADS(ads=ads, hop_size=hop_size)
-
+ raise DuplicateArgument(
+ "Either 'hop_dur' or 'hop_size' can be specified, not both"
+ )
+ elif hop_size is not None:
+ hop_dur = hop_size / audio_source.sr
+
+ ads = AudioDataSource(
+ audio_source,
+ block_dur=block_dur,
+ hop_dur=hop_dur,
+ record=record,
+ max_read=max_time,
+ )
return ads
-
-
- class AudioDataSource(DataSource):
- """
- Base class for AudioDataSource objects.
- It inherits from DataSource and encapsulates an AudioSource object.
- """
-
- def __init__(self, audio_source, block_size):
-
- self.audio_source = audio_source
- self.block_size = block_size
-
- def get_block_size(self):
- return self.block_size
-
- def set_block_size(self, size):
- self.block_size = size
-
- def get_audio_source(self):
- return self.audio_source
-
- def set_audio_source(self, audio_source):
- self.audio_source = audio_source
-
- def open(self):
- self.audio_source.open()
-
- def close(self):
- self.audio_source.close()
-
- def is_open(self):
- return self.audio_source.is_open()
-
- def get_sampling_rate(self):
- return self.audio_source.get_sampling_rate()
-
- def get_sample_width(self):
- return self.audio_source.get_sample_width()
-
- def get_channels(self):
- return self.audio_source.get_channels()
-
-
- def rewind(self):
- if isinstance(self.audio_source, Rewindable):
- self.audio_source.rewind()
- else:
- raise Exception("Audio source is not rewindable")
-
-
-
- def is_rewindable(self):
- return isinstance(self.audio_source, Rewindable)
-
-
- def read(self):
- return self.audio_source.read(self.block_size)
-
-
- class ADSDecorator(AudioDataSource):
- """
- Base decorator class for AudioDataSource objects.
- """
- __metaclass__ = ABCMeta
-
- def __init__(self, ads):
- self.ads = ads
-
- self.get_block_size = self.ads.get_block_size
- self.set_block_size = self.ads.set_block_size
- self.get_audio_source = self.ads.get_audio_source
- self.open = self.ads.open
- self.close = self.ads.close
- self.is_open = self.ads.is_open
- self.get_sampling_rate = self.ads.get_sampling_rate
- self.get_sample_width = self.ads.get_sample_width
- self.get_channels = self.ads.get_channels
-
- def is_rewindable(self):
- return self.ads.is_rewindable
-
- def rewind(self):
- self.ads.rewind()
- self._reinit()
-
- def set_audio_source(self, audio_source):
- self.ads.set_audio_source(audio_source)
- self._reinit()
-
- def open(self):
- if not self.ads.is_open():
- self.ads.open()
- self._reinit()
-
- @abstractmethod
- def _reinit(self):
- pass
-
-
- class OverlapADS(ADSDecorator):
- """
- A class for AudioDataSource objects that can read and return overlapping audio frames
- """
-
- def __init__(self, ads, hop_size):
- ADSFactory.ADSDecorator.__init__(self, ads)
-
- if hop_size <= 0 or hop_size > self.get_block_size():
- raise ValueError("hop_size must be either 'None' or \
- between 1 and block_size (both inclusive)")
- self.hop_size = hop_size
- self._actual_block_size = self.get_block_size()
- self._reinit()
-
-
- def _get_block_size():
- return self._actual_block_size
-
-
- def _read_first_block(self):
- # For the first call, we need an entire block of size 'block_size'
- block = self.ads.read()
- if block is None:
- return None
-
- # Keep a slice of data in cache and append it in the next call
- if len(block) > self._hop_size_bytes:
- self._cache = block[self._hop_size_bytes:]
-
- # Up from the next call, we will use '_read_next_blocks'
- # and we only read 'hop_size'
- self.ads.set_block_size(self.hop_size)
- self.read = self._read_next_blocks
-
- return block
-
- def _read_next_blocks(self):
- block = self.ads.read()
- if block is None:
- return None
-
- # Append block to cache data to ensure overlap
- block = self._cache + block
- # Keep a slice of data in cache only if we have a full length block
- # if we don't that means that this is the last block
- if len(block) == self._block_size_bytes:
- self._cache = block[self._hop_size_bytes:]
- else:
- self._cache = None
-
- return block
- def read(self):
- pass
-
- def _reinit(self):
+
+class _AudioReadingProxy:
+ def __init__(self, audio_source):
+
+ self._audio_source = audio_source
+
+ def rewind(self):
+ if self.rewindable:
+ self._audio_source.rewind()
+ else:
+ raise AudioIOError("Audio stream is not rewindable")
+
+ def rewindable(self):
+ try:
+ return self._audio_source.rewindable
+ except AttributeError:
+ return False
+
+ def is_open(self):
+ return self._audio_source.is_open()
+
+ def open(self):
+ self._audio_source.open()
+
+ def close(self):
+ self._audio_source.close()
+
+ def read(self, size):
+ return self._audio_source.read(size)
+
+ @property
+ def data(self):
+ err_msg = "This AudioReader is not a recorder, no recorded data can "
+ err_msg += "be retrieved"
+ raise AttributeError(err_msg)
+
+ def __getattr__(self, name):
+ return getattr(self._audio_source, name)
+
+
+class _Recorder(_AudioReadingProxy):
+ """
+ Class for `AudioReader` objects that can record all data they read. Useful
+ when reading data from the microphone.
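+
+ An illustrative sketch (private helper; normally reached through
+ ``AudioReader(..., record=True)``):
+
+ .. code:: python
+
+     src = BufferAudioSource(b"0123456789abcdef", 16000, 2, 1)
+     rec = _Recorder(src)
+     rec.open()
+     first = rec.read(4)       # 4 samples = 8 bytes
+     rec.rewind()              # from now on, reads come from the cache
+     assert rec.read(4) == first
+     assert rec.data == first  # only the data read so far was cached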
+ """
+
+ def __init__(self, audio_source):
+ super(_Recorder, self).__init__(audio_source)
+ self._cache = []
+ self._read_block = self._read_and_cache
+ self._read_from_cache = False
+ self._data = None
+
+ def read(self, size):
+ return self._read_block(size)
+
+ @property
+ def data(self):
+ if self._data is None:
+ err_msg = "Unrewinded recorder. `rewind` should be called before "
+ err_msg += "accessing recorded data"
+ raise RuntimeError(err_msg)
+ return self._data
+
+ def rewindable(self):
+ return True
+
+ def rewind(self):
+ if self._read_from_cache:
+ self._audio_source.rewind()
+ else:
+ self._data = b"".join(self._cache)
self._cache = None
- self.ads.set_block_size(self._actual_block_size)
- self._hop_size_bytes = self.hop_size * \
- self.get_sample_width() * \
- self.get_channels()
- self._block_size_bytes = self.get_block_size() * \
- self.get_sample_width() * \
- self.get_channels()
- self.read = self._read_first_block
+ self._audio_source = BufferAudioSource(
+ self._data, self.sr, self.sw, self.ch
+ )
+ self._read_block = self._audio_source.read
+ self.open()
+ self._read_from_cache = True
+
+ def _read_and_cache(self, size):
+ # Read and save read data
+ block = self._audio_source.read(size)
+ if block is not None:
+ self._cache.append(block)
+ return block
+
- class LimiterADS(ADSDecorator):
- """
- A class for AudioDataSource objects that can read a fixed amount of data.
- This can be useful when reading data from the microphone or from large audio files.
- """
-
- def __init__(self, ads, max_time):
- ADSFactory.ADSDecorator.__init__(self, ads)
-
- self.max_time = max_time
- self._reinit()
-
- def read(self):
- if self._total_read_bytes >= self._max_read_bytes:
- return None
- block = self.ads.read()
- if block is None:
- return None
- self._total_read_bytes += len(block)
-
- if self._total_read_bytes >= self._max_read_bytes:
- self.close()
-
- return block
-
-
- def _reinit(self):
- self._max_read_bytes = int(self.max_time * self.get_sampling_rate()) * \
- self.get_sample_width() * \
- self.get_channels()
- self._total_read_bytes = 0
+class _Limiter(_AudioReadingProxy):
+ """
+ Class for `AudioReader` objects that can read a fixed amount of data.
+ This can be useful when reading data from the microphone or from large
+ audio files.
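+
+ For example (illustrative), limiting a 1-second buffer to 0.5 s:
+
+ .. code:: python
+
+     src = BufferAudioSource(b"ab" * 16000, 16000, 2, 1)  # 1 s of audio
+     limited = _Limiter(src, max_read=0.5)
+     limited.open()
+     block = limited.read(16000)       # asks for 1 s, gets 0.5 s
+     assert len(block) == 8000 * 2     # 8000 samples, 2 bytes each
+     assert limited.read(1) is None    # limit reached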
+ """
-
+ def __init__(self, audio_source, max_read):
+ super(_Limiter, self).__init__(audio_source)
+ self._max_read = max_read
+ self._max_samples = round(max_read * self.sr)
+ self._bytes_per_sample = self.sw * self.ch
+ self._read_samples = 0
- class RecorderADS(ADSDecorator):
- """
- A class for AudioDataSource objects that can record all audio data they read,
- with a rewind facility.
- """
-
- def __init__(self, ads):
- ADSFactory.ADSDecorator.__init__(self, ads)
-
- self._reinit()
-
- def read(self):
- pass
-
- def _read_and_rec(self):
- # Read and save read data
- block = self.ads.read()
- if block is not None:
- self._cache.append(block)
-
+ @property
+ def data(self):
+ data = self._audio_source.data
+ max_read_bytes = self._max_samples * self._bytes_per_sample
+ return data[:max_read_bytes]
+
+ @property
+ def max_read(self):
+ return self._max_read
+
+ def read(self, size):
+ size = min(self._max_samples - self._read_samples, size)
+ if size <= 0:
+ return None
+ block = self._audio_source.read(size)
+ if block is None:
+ return None
+ self._read_samples += len(block) // self._bytes_per_sample
+ return block
+
+ def rewind(self):
+ super(_Limiter, self).rewind()
+ self._read_samples = 0
+
+
+class _FixedSizeAudioReader(_AudioReadingProxy):
+ """
+ Class to read fixed-size audio windows from source.
+ """
+
+ def __init__(self, audio_source, block_dur):
+ super(_FixedSizeAudioReader, self).__init__(audio_source)
+
+ if block_dur <= 0:
+ raise ValueError(
+ "block_dur must be > 0, given: {}".format(block_dur)
+ )
+
+ self._block_size = int(block_dur * self.sr)
+ if self._block_size == 0:
+ err_msg = "Too small block_dur ({0:f}) for sampling rate ({1}). "
+ err_msg += "block_dur should cover at least one sample "
+ err_msg += "(i.e. 1/{1})"
+ raise TooSamllBlockDuration(
+ err_msg.format(block_dur, self.sr), block_dur, self.sr
+ )
+
+ def read(self):
+ return self._audio_source.read(self._block_size)
+
+ @property
+ def block_size(self):
+ return self._block_size
+
+ @property
+ def block_dur(self):
+ return self._block_size / self.sr
+
+ def __getattr__(self, name):
+ return getattr(self._audio_source, name)
+
+
+class _OverlapAudioReader(_FixedSizeAudioReader):
+ """
+ Class for `AudioReader` objects that can read and return overlapping audio
+ windows.
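+
+ A sketch adapted from the old `ADSFactory` examples, using the public
+ :class:`AudioReader` wrapper defined below: with a 16-byte buffer at
+ sr=16, sw=1 and ch=1, `block_dur=0.25` (4 samples) and `hop_dur=0.125`
+ (2 samples) produce 4-byte windows overlapping by 2 bytes:
+
+ .. code:: python
+
+     reader = AudioReader(b"abcdefghijklmnop", block_dur=0.25,
+                          hop_dur=0.125, sr=16, sw=1, ch=1)
+     reader.open()
+     assert reader.read() == b"abcd"
+     assert reader.read() == b"cdef"
+     assert reader.read() == b"efgh"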
+ """
+
+ def __init__(self, audio_source, block_dur, hop_dur):
+
+ if hop_dur >= block_dur:
+ raise ValueError('"hop_dur" should be < "block_dur"')
+
+ super(_OverlapAudioReader, self).__init__(audio_source, block_dur)
+
+ self._hop_size = int(hop_dur * self.sr)
+ self._blocks = self._iter_blocks_with_overlap()
+
+ def _iter_blocks_with_overlap(self):
+ while not self.is_open():
+ yield AudioIOError
+ block = self._audio_source.read(self._block_size)
+ if block is None:
+ yield None
+
+ _hop_size_bytes = (
+ self._hop_size * self._audio_source.sw * self._audio_source.ch
+ )
+ cache = block[_hop_size_bytes:]
+ yield block
+
+ while True:
+ block = self._audio_source.read(self._hop_size)
+ if block:
+ block = cache + block
+ cache = block[_hop_size_bytes:]
+ yield block
+ continue
+ yield None
+
+ def read(self):
+ try:
+ block = next(self._blocks)
+ if block == AudioIOError:
+ raise AudioIOError("Audio Stream is not open.")
return block
-
-
- def _read_simple(self):
- # Read without recording
- return self.ads.read()
-
- def rewind(self):
- if self._record:
- # If has been recording, create a new BufferAudioSource
- # from recorded data
- dbuffer = self._concatenate(self._cache)
- asource = BufferAudioSource(dbuffer, self.get_sampling_rate(),
- self.get_sample_width(),
- self.get_channels())
-
-
- self.set_audio_source(asource)
- self.open()
- self._cache = []
- self._record = False
- self.read = self._read_simple
-
- else:
- self.ads.rewind()
- if not self.is_open():
- self.open()
-
-
- def is_rewindable(self):
- return True
-
- def _reinit(self):
- # when audio_source is replaced, start recording again
- self._record = True
- self._cache = []
- self.read = self._read_and_rec
-
- def _concatenate(self, data):
- try:
- # should always work for python 2
- # work for python 3 ONLY if data is a list (or an iterator)
- # whose each element is a 'bytes' objects
- return b''.join(data)
- except TypeError:
- # work for 'str' in python 2 and python 3
- return ''.join(data)
+ except StopIteration:
+ return None
+
+ def rewind(self):
+ super(_OverlapAudioReader, self).rewind()
+ self._blocks = self._iter_blocks_with_overlap()
+
-class AudioEnergyValidator(DataValidator):
+ @property
+ def hop_size(self):
+ return self._hop_size
+
+ @property
+ def hop_dur(self):
+ return self._hop_size / self.sr
+
+ def __getattr__(self, name):
+ return getattr(self._audio_source, name)
+
+
+class AudioReader(DataSource):
"""
- The most basic auditok audio frame validator.
- This validator computes the log energy of an input audio frame
- and return True if the result is >= a given threshold, False
- otherwise.
-
- :Parameters:
-
- `sample_width` : *(int)*
- Number of bytes of one audio sample. This is used to convert data from `basestring` or `Bytes` to
- an array of floats.
-
- `energy_threshold` : *(float)*
- A threshold used to check whether an input data buffer is valid.
+ Class to read fixed-size chunks of audio data from a source. A source
+ can be a file on disk, standard input (with `input` = "-") or the
+ microphone. This is normally used by tokenization algorithms that
+ expect source objects with a `read` function that returns windows of
+ data of the same size at each call, except when the remaining data
+ does not make up a full window.
+
+ Objects of this class can be set up to return audio windows with a given
+ overlap and to record the whole stream for later access (useful when
+ reading data from the microphone). They can also have
+ a limit for the maximum amount of data to read.
+
+ Parameters
+ ----------
+ input : str, bytes, AudioSource, AudioReader, AudioRegion or None
+ input audio data. If the type of the passed argument is `str`, it
+ should be a path to an existing audio file. "-" is interpreted as
+ standard input.
+ If the type is `bytes`, input is considered as a buffer of raw audio
+ data. If None, read audio from microphone. Every object that is not an
+ :class:`AudioReader` will be transformed, when possible, into an
+ :class:`AudioSource` before processing. If it is a `str` that refers
+ to a raw audio file, `bytes` or None, audio parameters should be
+ provided using kwargs (i.e., `sampling_rate`, `sample_width` and
+ `channels` or their aliases).
+ block_dur: float, default: 0.01
+ length in seconds of audio windows to return at each `read` call.
+ hop_dur: float, default: None
+ length in seconds of the amount of data to skip from the previous
+ window. If defined, it is used to compute the temporal overlap
+ between the previous and current window (namely
+ `overlap = block_dur - hop_dur`). The default, None, means that
+ consecutive windows do not overlap.
+ record: bool, default: False
+ whether to record read audio data for later access. If True, audio
+ data can be retrieved by first calling `rewind()`, then using the
+ `data` property. Note that once `rewind()` is called, no new data is
+ read from the source (subsequent `read()` calls return data from the
+ cache) and there is no need to call `rewind()` again to access the
+ `data` property.
+ max_read: float, default: None
+ maximum amount of audio data to read, in seconds. The default, None,
+ means that data is read until the end of the stream is reached or,
+ when reading from the microphone, until a Ctrl-C is sent.
+
+ When `input` is None, a bytes object or a raw audio file, some of the
+ following kwargs are mandatory.
+
+ Other Parameters
+ ----------------
+ audio_format, fmt : str
+ type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
+ used if `input` is a string path to an audio file. If not given, audio
+ type will be guessed from file name extension or from file header.
+ sampling_rate, sr : int
+ sampling rate of audio data. Required if `input` is a raw audio file, is
+ a bytes object or None (i.e., read from microphone).
+ sample_width, sw : int
+ number of bytes used to encode one audio sample, typically 1, 2 or 4.
+ Required for raw data, see `sampling_rate`.
+ channels, ch : int
+ number of channels of audio data. Required for raw data, see
+ `sampling_rate`.
+ use_channel, uc : {None, "any", "mix", "avg", "average"} or int
+ which channel to use for split if `input` has multiple audio channels.
+ Regardless of which channel is used for splitting, returned audio events
+ contain data from *all* the channels of `input`. The following values
+ are accepted:
+
+ - None (alias "any"): accept audio activity from any channel, even if
+ other channels are silent. This is the default behavior.
+
+ - "mix" (alias "avg" or "average"): mix down all channels (i.e., compute
+ average channel) and split the resulting channel.
+
+ - int (>= 0, < `channels`): use one channel, specified by its integer
+ id, for split.
+
+ large_file : bool, default: False
+ If True, AND if `input` is a path to a *wav* or a *raw* audio file
+ (and only these two formats), then audio data is lazily loaded into
+ memory (i.e., one analysis window at a time). Otherwise the whole
+ file is loaded into memory before splitting. Set to True if the size
+ of the file is larger than available memory.
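+
+ Examples
+ --------
+ A minimal usage sketch (the file name is a placeholder):
+
+ .. code:: python
+
+     reader = AudioReader("audio.wav", block_dur=0.02, record=True)
+     reader.open()
+     while True:
+         window = reader.read()  # 20 ms of data, None at the end
+         if window is None:
+             break
+     reader.rewind()             # required before accessing `data`
+     whole_signal = reader.data
+     reader.close()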
+ """
+
+ def __init__(
+ self,
+ input,
+ block_dur=0.01,
+ hop_dur=None,
+ record=False,
+ max_read=None,
+ **kwargs
+ ):
+ if not isinstance(input, AudioSource):
+ input = get_audio_source(input, **kwargs)
+ self._record = record
+ if record:
+ input = _Recorder(input)
+ if max_read is not None:
+ input = _Limiter(input, max_read)
+ self._max_read = max_read
+ if hop_dur is not None:
+ input = _OverlapAudioReader(input, block_dur, hop_dur)
+ else:
+ input = _FixedSizeAudioReader(input, block_dur)
+ self._audio_source = input
+
+ def __repr__(self):
+ block_dur, hop_dur, max_read = None, None, None
+ if self.block_dur is not None:
+ block_dur = "{:.3f}".format(self.block_dur)
+ if self.hop_dur is not None:
+ hop_dur = "{:.3f}".format(self.hop_dur)
+ if self.max_read is not None:
+ max_read = "{:.3f}".format(self.max_read)
+ return (
+ "{cls}(block_dur={block_dur}, "
+ "hop_dur={hop_dur}, record={rewindable}, "
+ "max_read={max_read})"
+ ).format(
+ cls=self.__class__.__name__,
+ block_dur=block_dur,
+ hop_dur=hop_dur,
+ rewindable=self._record,
+ max_read=max_read,
+ )
+
+ @property
+ def rewindable(self):
+ return self._record
+
+ @property
+ def block_dur(self):
+ return self._audio_source.block_size / self._audio_source.sr
+
+ @property
+ def hop_dur(self):
+ if hasattr(self._audio_source, "hop_dur"):
+ return self._audio_source.hop_size / self._audio_source.sr
+ return self.block_dur
+
+ @property
+ def hop_size(self):
+ if hasattr(self._audio_source, "hop_size"):
+ return self._audio_source.hop_size
+ return self.block_size
+
+ @property
+ def max_read(self):
+ try:
+ return self._audio_source.max_read
+ except AttributeError:
+ return None
+
+ def read(self):
+ return self._audio_source.read()
+
+ def __getattr__(self, name):
+ if name in ("data", "rewind") and not self.rewindable:
+ raise AttributeError(
+ "'AudioReader' has no attribute '{}'".format(name)
+ )
+ try:
+ return getattr(self._audio_source, name)
+ except AttributeError:
+ raise AttributeError(
+ "'AudioReader' has no attribute '{}'".format(name)
+ )
+
+
+# Keep AudioDataSource for compatibility
+# Remove in a future version when ADSFactory is removed
+AudioDataSource = AudioReader
+
+
+class Recorder(AudioReader):
+ """Class to read fixed-size chunks of audio data from a source and keeps
+ data in a cache. Using this class is equivalent to initializing
+ :class:`AudioReader` with `record=True`. For more information about the
+ other parameters see :class:`AudioReader`.
+
+ Once the desired amount of data is read, you can call the :func:`rewind`
+ method then get the recorded data via the :attr:`data` attribute. You can also
+ re-read cached data one window a time by calling :func:`read`.
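+
+ A minimal sketch (microphone input; the audio parameters are
+ illustrative):
+
+ .. code:: python
+
+     rec = Recorder(input=None, max_read=5, sr=16000, sw=2, ch=1)
+     rec.open()
+     while rec.read() is not None:  # stops after 5 s (max_read)
+         pass
+     rec.rewind()
+     recorded_bytes = rec.data
+     rec.close()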
"""
-
-
- if _WITH_NUMPY:
-
- _formats = {1: numpy.int8 , 2: numpy.int16, 4: numpy.int32}
-
- @staticmethod
- def _convert(signal, sample_width):
- return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]), dtype=numpy.float64)
-
- @staticmethod
- def _signal_energy(signal):
- return float(numpy.dot(signal, signal)) / len(signal)
-
- @staticmethod
- def _signal_log_energy(signal):
- energy = AudioEnergyValidator._signal_energy(signal)
- if energy <= 0:
- return -200
- return 10. * numpy.log10(energy)
-
- else:
-
-
- _formats = {1: 'b' , 2: 'h', 4: 'i'}
-
- @staticmethod
- def _convert(signal, sample_width):
- return array("d", array(AudioEnergyValidator._formats[sample_width], signal))
-
- @staticmethod
- def _signal_energy(signal):
- energy = 0.
- for a in signal:
- energy += a * a
- return energy / len(signal)
-
- @staticmethod
- def _signal_log_energy(signal):
- energy = AudioEnergyValidator._signal_energy(signal)
- if energy <= 0:
- return -200
- return 10. * math.log10(energy)
-
-
- def __init__(self, sample_width, energy_threshold=45):
- self.sample_width = sample_width
- self._energy_threshold = energy_threshold
-
-
- def is_valid(self, data):
- """
- Check if data is valid. Audio data will be converted into an array (of
- signed values) of which the log energy is computed. Log energy is computed
- as follows:
-
- .. code:: python
-
- arr = AudioEnergyValidator._convert(signal, sample_width)
- energy = float(numpy.dot(arr, arr)) / len(arr)
- log_energy = 10. * numpy.log10(energy)
-
-
- :Parameters:
-
- `data` : either a *string* or a *Bytes* buffer
- `data` is converted into a numerical array using the `sample_width`
- given in the constructor.
-
- :Returns:
-
- True if `log_energy` >= `energy_threshold`, False otherwise.
- """
-
- signal = AudioEnergyValidator._convert(data, self.sample_width)
- return AudioEnergyValidator._signal_log_energy(signal) >= self._energy_threshold
-
- def get_energy_threshold(self):
- return self._energy_threshold
-
- def set_energy_threshold(self, threshold):
- self._energy_threshold = threshold
+ def __init__(
+ self, input, block_dur=0.01, hop_dur=None, max_read=None, **kwargs
+ ):
+ super().__init__(
+ input,
+ block_dur=block_dur,
+ hop_dur=hop_dur,
+ record=True,
+ max_read=max_read,
+ **kwargs
+ )
diff --git a/libs/auditok/workers.py b/libs/auditok/workers.py
new file mode 100755
index 000000000..bb6d54a98
--- /dev/null
+++ b/libs/auditok/workers.py
@@ -0,0 +1,427 @@
+import os
+import sys
+from tempfile import NamedTemporaryFile
+from abc import ABCMeta, abstractmethod
+from threading import Thread
+from datetime import datetime, timedelta
+from collections import namedtuple
+import wave
+import subprocess
+from queue import Queue, Empty
+from .io import _guess_audio_format
+from .util import AudioDataSource, make_duration_formatter
+from .core import split
+from .exceptions import (
+ EndOfProcessing,
+ AudioEncodingError,
+ AudioEncodingWarning,
+)
+
+
+_STOP_PROCESSING = "STOP_PROCESSING"
+_Detection = namedtuple("_Detection", "id start end duration")
+
+
+def _run_subprocess(command):
+ try:
+ with subprocess.Popen(
+ command,
+ stdin=open(os.devnull, "rb"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ ) as proc:
+ stdout, stderr = proc.communicate()
+ return proc.returncode, stdout, stderr
+ except Exception:
+ err_msg = "Couldn't export audio using command: '{}'".format(command)
+ raise AudioEncodingError(err_msg)
+
+
+class Worker(Thread, metaclass=ABCMeta):
+ def __init__(self, timeout=0.5, logger=None):
+ self._timeout = timeout
+ self._logger = logger
+ self._inbox = Queue()
+ Thread.__init__(self)
+
+ def run(self):
+ while True:
+ message = self._get_message()
+ if message == _STOP_PROCESSING:
+ break
+ if message is not None:
+ self._process_message(message)
+ self._post_process()
+
+ @abstractmethod
+ def _process_message(self, message):
+ """Process incoming messages"""
+
+ def _post_process(self):
+ pass
+
+ def _log(self, message):
+ self._logger.info(message)
+
+ def _stop_requested(self):
+ try:
+ message = self._inbox.get_nowait()
+ if message == _STOP_PROCESSING:
+ return True
+ except Empty:
+ return False
+
+ def stop(self):
+ self.send(_STOP_PROCESSING)
+ self.join()
+
+ def send(self, message):
+ self._inbox.put(message)
+
+ def _get_message(self):
+ try:
+ message = self._inbox.get(timeout=self._timeout)
+ return message
+ except Empty:
+ return None
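+
+# A minimal sketch of the Worker protocol (illustrative, not part of
+# the public API): subclasses implement `_process_message` and receive
+# messages asynchronously through `send`:
+#
+#     class EchoWorker(Worker):
+#         def _process_message(self, message):
+#             print("got:", message)
+#
+#     worker = EchoWorker()
+#     worker.start()
+#     worker.send("hello")  # handled by the worker thread
+#     worker.stop()         # posts _STOP_PROCESSING and joins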
+
+
+class TokenizerWorker(Worker, AudioDataSource):
+ def __init__(self, reader, observers=None, logger=None, **kwargs):
+ self._observers = observers if observers is not None else []
+ self._reader = reader
+ self._audio_region_gen = split(self, **kwargs)
+ self._detections = []
+ self._log_format = "[DET]: Detection {0.id} (start: {0.start:.3f}, "
+ self._log_format += "end: {0.end:.3f}, duration: {0.duration:.3f})"
+ Worker.__init__(self, timeout=0.2, logger=logger)
+
+ def _process_message(self, message):
+ pass
+
+ @property
+ def detections(self):
+ return self._detections
+
+ def _notify_observers(self, message):
+ for observer in self._observers:
+ observer.send(message)
+
+ def run(self):
+ self._reader.open()
+ start_processing_timestamp = datetime.now()
+ for _id, audio_region in enumerate(self._audio_region_gen, start=1):
+ timestamp = start_processing_timestamp + timedelta(
+ seconds=audio_region.meta.start
+ )
+ audio_region.meta.timestamp = timestamp
+ detection = _Detection(
+ _id,
+ audio_region.meta.start,
+ audio_region.meta.end,
+ audio_region.duration,
+ )
+ self._detections.append(detection)
+ if self._logger is not None:
+ message = self._log_format.format(detection)
+ self._log(message)
+ self._notify_observers((_id, audio_region))
+ self._notify_observers(_STOP_PROCESSING)
+ self._reader.close()
+
+ def start_all(self):
+ for observer in self._observers:
+ observer.start()
+ self.start()
+
+ def stop_all(self):
+ self.stop()
+ for observer in self._observers:
+ observer.stop()
+ self._reader.close()
+
+ def read(self):
+ if self._stop_requested():
+ return None
+ else:
+ return self._reader.read()
+
+ def __getattr__(self, name):
+ return getattr(self._reader, name)
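+
+# Illustrative wiring of a TokenizerWorker (the file name and the
+# `split` keyword arguments such as `min_dur` are assumptions):
+#
+#     reader = AudioReader("audio.wav", block_dur=0.01)
+#     printer = PrintWorker()
+#     tokenizer = TokenizerWorker(reader, observers=[printer], min_dur=0.2)
+#     tokenizer.start_all()  # starts observers, then the tokenizer
+#     tokenizer.join()       # wait until the stream is exhausted
+#     tokenizer.stop_all()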
+
+
+class StreamSaverWorker(Worker):
+ def __init__(
+ self,
+ audio_reader,
+ filename,
+ export_format=None,
+ cache_size_sec=0.5,
+ timeout=0.2,
+ ):
+ self._reader = audio_reader
+ sample_size_bytes = self._reader.sw * self._reader.ch
+ self._cache_size = cache_size_sec * self._reader.sr * sample_size_bytes
+ self._output_filename = filename
+ self._export_format = _guess_audio_format(export_format, filename)
+ if self._export_format is None:
+ self._export_format = "wav"
+ self._init_output_stream()
+ self._exported = False
+ self._cache = []
+ self._total_cached = 0
+ Worker.__init__(self, timeout=timeout)
+
+ def _get_non_existent_filename(self):
+ filename = self._output_filename + ".wav"
+ i = 0
+ while os.path.exists(filename):
+ i += 1
+ filename = self._output_filename + "({}).wav".format(i)
+ return filename
+
+ def _init_output_stream(self):
+ if self._export_format != "wav":
+ self._tmp_output_filename = self._get_non_existent_filename()
+ else:
+ self._tmp_output_filename = self._output_filename
+ self._wfp = wave.open(self._tmp_output_filename, "wb")
+ self._wfp.setframerate(self._reader.sr)
+ self._wfp.setsampwidth(self._reader.sw)
+ self._wfp.setnchannels(self._reader.ch)
+
+ @property
+ def sr(self):
+ return self._reader.sampling_rate
+
+ @property
+ def sw(self):
+ return self._reader.sample_width
+
+ @property
+ def ch(self):
+ return self._reader.channels
+
+ def __del__(self):
+ self._post_process()
+
+ if (
+ (self._tmp_output_filename != self._output_filename)
+ and self._exported
+ and os.path.exists(self._tmp_output_filename)
+ ):
+ os.remove(self._tmp_output_filename)
+
+ def _process_message(self, data):
+ self._cache.append(data)
+ self._total_cached += len(data)
+ if self._total_cached >= self._cache_size:
+ self._write_cached_data()
+
+ def _post_process(self):
+ while True:
+ try:
+ data = self._inbox.get_nowait()
+ if data != _STOP_PROCESSING:
+ self._cache.append(data)
+ self._total_cached += len(data)
+ except Empty:
+ break
+ self._write_cached_data()
+ self._wfp.close()
+
+ def _write_cached_data(self):
+ if self._cache:
+ data = b"".join(self._cache)
+ self._wfp.writeframes(data)
+ self._cache = []
+ self._total_cached = 0
+
+ def open(self):
+ self._reader.open()
+
+ def close(self):
+ self._reader.close()
+ self.stop()
+
+ def rewind(self):
+ # ensure compatibility with AudioDataSource with record=True
+ pass
+
+ @property
+ def data(self):
+ with wave.open(self._tmp_output_filename, "rb") as wfp:
+ return wfp.readframes(-1)
+
+ def save_stream(self):
+ if self._exported:
+ return self._output_filename
+
+ if self._export_format in ("raw", "wav"):
+ if self._export_format == "raw":
+ self._export_raw()
+ self._exported = True
+ return self._output_filename
+ try:
+ self._export_with_ffmpeg_or_avconv()
+ except AudioEncodingError:
+ try:
+ self._export_with_sox()
+ except AudioEncodingError:
+ warn_msg = "Couldn't save audio data in the desired format "
+ warn_msg += "'{}'. Either none of 'ffmpeg', 'avconv' or 'sox' "
+ warn_msg += "is installed or this format is not recognized.\n"
+ warn_msg += "Audio file was saved as '{}'"
+ raise AudioEncodingWarning(
+ warn_msg.format(
+ self._export_format, self._tmp_output_filename
+ )
+ )
+ finally:
+ self._exported = True
+ return self._output_filename
+
+ def _export_raw(self):
+ with open(self._output_filename, "wb") as wfp:
+ wfp.write(self.data)
+
+ def _export_with_ffmpeg_or_avconv(self):
+ command = [
+ "-y",
+ "-f",
+ "wav",
+ "-i",
+ self._tmp_output_filename,
+ "-f",
+ self._export_format,
+ self._output_filename,
+ ]
+ returncode, stdout, stderr = _run_subprocess(["ffmpeg"] + command)
+ if returncode != 0:
+ returncode, stdout, stderr = _run_subprocess(["avconv"] + command)
+ if returncode != 0:
+ raise AudioEncodingError(stderr)
+ return stdout, stderr
+
+ def _export_with_sox(self):
+ command = [
+ "sox",
+ "-t",
+ "wav",
+ self._tmp_output_filename,
+ self._output_filename,
+ ]
+ returncode, stdout, stderr = _run_subprocess(command)
+ if returncode != 0:
+ raise AudioEncodingError(stderr)
+ return stdout, stderr
+
+ def close_output(self):
+ self._wfp.close()
+
+ def read(self):
+ data = self._reader.read()
+ if data is not None:
+ self.send(data)
+ else:
+ self.send(_STOP_PROCESSING)
+ return data
+
+ def __getattr__(self, name):
+ if name == "data":
+ return self.data
+ return getattr(self._reader, name)
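+
+# Illustrative use of StreamSaverWorker (the output name and format are
+# assumptions; non-wav export requires ffmpeg, avconv or sox):
+#
+#     reader = StreamSaverWorker(AudioReader(None, sr=16000, sw=2, ch=1),
+#                                "stream.ogg")
+#     reader.start()
+#     ...  # use `reader` like any AudioReader; each read() also
+#     ...  # enqueues the data for the writer thread
+#     reader.close()
+#     reader.save_stream()  # converts the temporary wav to ogg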
+
+
+class PlayerWorker(Worker):
+ def __init__(self, player, progress_bar=False, timeout=0.2, logger=None):
+ self._player = player
+ self._progress_bar = progress_bar
+ self._log_format = "[PLAY]: Detection {id} played"
+ Worker.__init__(self, timeout=timeout, logger=logger)
+
+ def _process_message(self, message):
+ _id, audio_region = message
+ if self._logger is not None:
+ message = self._log_format.format(id=_id)
+ self._log(message)
+ audio_region.play(
+ player=self._player, progress_bar=self._progress_bar, leave=False
+ )
+
+
+class RegionSaverWorker(Worker):
+ def __init__(
+ self,
+ filename_format,
+ audio_format=None,
+ timeout=0.2,
+ logger=None,
+ **audio_parameters
+ ):
+ self._filename_format = filename_format
+ self._audio_format = audio_format
+ self._audio_parameters = audio_parameters
+ self._debug_format = "[SAVE]: Detection {id} saved as '{filename}'"
+ Worker.__init__(self, timeout=timeout, logger=logger)
+
+ def _process_message(self, message):
+ _id, audio_region = message
+ filename = self._filename_format.format(
+ id=_id,
+ start=audio_region.meta.start,
+ end=audio_region.meta.end,
+ duration=audio_region.duration,
+ )
+ filename = audio_region.save(
+ filename, self._audio_format, **self._audio_parameters
+ )
+ if self._logger:
+ message = self._debug_format.format(id=_id, filename=filename)
+ self._log(message)
+
+
+class CommandLineWorker(Worker):
+ def __init__(self, command, timeout=0.2, logger=None):
+ self._command = command
+ Worker.__init__(self, timeout=timeout, logger=logger)
+ self._debug_format = "[COMMAND]: Detection {id} command: '{command}'"
+
+ def _process_message(self, message):
+ _id, audio_region = message
+ with NamedTemporaryFile(delete=False) as file:
+ filename = audio_region.save(file.name, audio_format="wav")
+ command = self._command.format(file=filename)
+ os.system(command)
+ if self._logger is not None:
+ message = self._debug_format.format(id=_id, command=command)
+ self._log(message)
+
+
+class PrintWorker(Worker):
+ def __init__(
+ self,
+ print_format="{start} {end}",
+ time_format="%S",
+ timestamp_format="%Y/%m/%d %H:%M:%S.%f",
+ timeout=0.2,
+ ):
+
+ self._print_format = print_format
+ self._format_time = make_duration_formatter(time_format)
+ self._timestamp_format = timestamp_format
+ self.detections = []
+ Worker.__init__(self, timeout=timeout)
+
+ def _process_message(self, message):
+ _id, audio_region = message
+ timestamp = audio_region.meta.timestamp
+ timestamp = timestamp.strftime(self._timestamp_format)
+ text = self._print_format.format(
+ id=_id,
+ start=self._format_time(audio_region.meta.start),
+ end=self._format_time(audio_region.meta.end),
+ duration=self._format_time(audio_region.duration),
+ timestamp=timestamp,
+ )
+ print(text)