author      Michiel van Baak Jansen <[email protected]>   2021-05-01 14:07:20 +0200
committer   GitHub <[email protected]>                    2021-05-01 08:07:20 -0400
commit      30ef713fa260dc9d9d3120359cd052808606badb (patch)
tree        00a950b4e50852d93652e4afe1c8d5872f3d54b6
parent      08e50a8348f0298fcf291ecc7b4bdcddff7e1539 (diff)
download    bazarr-30ef713fa260dc9d9d3120359cd052808606badb.tar.gz
            bazarr-30ef713fa260dc9d9d3120359cd052808606badb.zip
Downgrade auditok to version 0.1.5
ffsubsync pins auditok to 0.1.5. We missed this when upgrading ffsubsync and auditok. Since we don't run pip to install the libraries, there are no version checks.
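Because these libraries are vendored under libs/ rather than installed with pip, nothing enforces the versions that dependents such as ffsubsync pin. Below is a minimal sketch of a sanity check that would have caught this mismatch; the PINNED table and check_vendored_pins helper are illustrative names, not part of bazarr or this commit, and only the auditok==0.1.5 pin is taken from the commit message.

# Hypothetical pin check for vendored libraries (illustrative, not in this repo).
import importlib
import sys

# Expected versions of vendored packages; mirrors ffsubsync's auditok pin noted above.
PINNED = {"auditok": "0.1.5"}

def check_vendored_pins(libs_path="libs"):
    # Make the vendored packages importable, then compare each package's
    # __version__ attribute against the expected pin.
    sys.path.insert(0, libs_path)
    mismatches = []
    for name, expected in PINNED.items():
        module = importlib.import_module(name)
        found = getattr(module, "__version__", "unknown")
        if found != expected:
            mismatches.append((name, expected, found))
    return mismatches

if __name__ == "__main__":
    for name, expected, found in check_vendored_pins():
        print("{0}: pinned {1}, vendored {2}".format(name, expected, found))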
-rw-r--r--               libs/auditok/__init__.py        10
-rwxr-xr-x [-rw-r--r--]  libs/auditok/cmdline.py       1155
-rw-r--r--               libs/auditok/cmdline_util.py   126
-rw-r--r--               libs/auditok/core.py          1656
-rw-r--r--               libs/auditok/dataset.py         24
-rw-r--r--               libs/auditok/exceptions.py      42
-rw-r--r--               libs/auditok/io.py            1264
-rw-r--r--               libs/auditok/plotting.py       150
-rw-r--r--               libs/auditok/signal.py         179
-rw-r--r--               libs/auditok/signal_numpy.py    30
-rw-r--r--               libs/auditok/util.py          1734
-rw-r--r--               libs/auditok/workers.py        427
-rw-r--r--               libs/version.txt                 2
13 files changed, 2226 insertions, 4573 deletions
diff --git a/libs/auditok/__init__.py b/libs/auditok/__init__.py
index edd336cc3..4ea697b77 100644
--- a/libs/auditok/__init__.py
+++ b/libs/auditok/__init__.py
@@ -2,16 +2,20 @@
:author:
Amine SEHILI <[email protected]>
-2015-2021
+2015-2016
:License:
-This package is published under the MIT license.
+This package is published under GNU GPL Version 3.
"""
+from __future__ import absolute_import
from .core import *
from .io import *
from .util import *
+from . import dataset
from .exceptions import *
-__version__ = "0.2.0"
+__version__ = "0.1.5"
+
+
diff --git a/libs/auditok/cmdline.py b/libs/auditok/cmdline.py
index 7e7450762..b6a51d11b 100644..100755
--- a/libs/auditok/cmdline.py
+++ b/libs/auditok/cmdline.py
@@ -1,428 +1,789 @@
#!/usr/bin/env python
# encoding: utf-8
-"""
-`auditok` -- An Audio Activity Detection tool
+'''
+auditok.auditok -- Audio Activity Detection tool
+
+auditok.auditok is a program that can be used for Audio/Acoustic activity detection.
+It can read audio data from audio files as well as from built-in device(s) or standard input
-`auditok` is a program that can be used for Audio/Acoustic
-activity detection. It can read audio data from audio files as well
-as from the microphone or standard input.
@author: Mohamed El Amine SEHILI
-@copyright: 2015-2021 Mohamed El Amine SEHILI
-@license: MIT
+
+@copyright: 2015 Mohamed El Amine SEHILI
+
+@license: GPL v3
+
-@deffield updated: 01 Mar 2021
-"""
+@deffield updated: 02 Dec 2015
+'''
import sys
import os
-from argparse import ArgumentParser
+
+from optparse import OptionParser, OptionGroup
+from threading import Thread
+import tempfile
+import wave
import time
import threading
+import logging
-from auditok import __version__, AudioRegion
-from .util import AudioDataSource
-from .exceptions import EndOfProcessing, AudioEncodingWarning
-from .io import player_for
-from .cmdline_util import make_logger, make_kwargs, initialize_workers
-from . import workers
+try:
+ import future
+ from queue import Queue, Empty
+except ImportError:
+ if sys.version_info >= (3, 0):
+ from queue import Queue, Empty
+ else:
+ from Queue import Queue, Empty
+try:
+ from pydub import AudioSegment
+ WITH_PYDUB = True
+except ImportError:
+ WITH_PYDUB = False
+
+
+from .core import StreamTokenizer
+from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for
+from .util import ADSFactory, AudioEnergyValidator
+from auditok import __version__ as version
__all__ = []
-__date__ = "2015-11-23"
-__updated__ = "2021-03-01"
+__version__ = version
+__date__ = '2015-11-23'
+__updated__ = '2015-03-11'
+
+DEBUG = 0
+TESTRUN = 1
+PROFILE = 0
+
+LOGGER_NAME = "AUDITOK_LOGGER"
+
+class AudioFileFormatError(Exception):
+ pass
+
+class TimeFormatError(Exception):
+ pass
+
+def file_to_audio_source(filename, filetype=None, **kwargs):
+
+ lower_fname = filename.lower()
+ rawdata = False
+
+ if filetype is not None:
+ filetype = filetype.lower()
+
+ if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")):
+
+ srate = kwargs.pop("sampling_rate", None)
+ if srate is None:
+ srate = kwargs.pop("sr", None)
+
+ swidth = kwargs.pop("sample_width", None)
+ if swidth is None:
+ swidth = kwargs.pop("sw", None)
+
+ ch = kwargs.pop("channels", None)
+ if ch is None:
+ ch = kwargs.pop("ch", None)
+
+ if None in (swidth, srate, ch):
+ raise Exception("All audio parameters are required for raw data")
+
+ data = open(filename).read()
+ rawdata = True
+
+ # try first with pydub
+ if WITH_PYDUB:
+
+ use_channel = kwargs.pop("use_channel", None)
+ if use_channel is None:
+ use_channel = kwargs.pop("uc", None)
+
+ if use_channel is None:
+ use_channel = 1
+ else:
+ try:
+ use_channel = int(use_channel)
+ except ValueError:
+ pass
+
+ if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] :
+ raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'")
+
+ asegment = None
+
+ if rawdata:
+ asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch)
+ if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")):
+ asegment = AudioSegment.from_wav(filename)
+ elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")):
+ asegment = AudioSegment.from_mp3(filename)
+ elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")):
+ asegment = AudioSegment.from_ogg(filename)
+ elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")):
+ asegment = AudioSegment.from_flv(filename)
+ else:
+ asegment = AudioSegment.from_file(filename)
+
+ if asegment.channels > 1:
+
+ if isinstance(use_channel, int):
+ if use_channel > asegment.channels:
+ raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels))
+ else:
+ asegment = asegment.split_to_mono()[use_channel - 1]
+ else:
+ ch_lower = use_channel.lower()
+
+ if ch_lower == "mix":
+ asegment = asegment.set_channels(1)
+
+ elif use_channel.lower() == "left":
+ asegment = asegment.split_to_mono()[0]
+
+ elif use_channel.lower() == "right":
+ asegment = asegment.split_to_mono()[1]
+
+ return BufferAudioSource(data_buffer = asegment._data,
+ sampling_rate = asegment.frame_rate,
+ sample_width = asegment.sample_width,
+ channels = asegment.channels)
+ # fall back to standard python
+ else:
+ if rawdata:
+ if ch != 1:
+ raise ValueError("Cannot handle multi-channel audio without pydub")
+ return BufferAudioSource(data, srate, swidth, ch)
+
+ if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")):
+
+ wfp = wave.open(filename)
+
+ ch = wfp.getnchannels()
+ if ch != 1:
+ wfp.close()
+ raise ValueError("Cannot handle multi-channel audio without pydub")
+
+ srate = wfp.getframerate()
+ swidth = wfp.getsampwidth()
+ data = wfp.readframes(wfp.getnframes())
+ wfp.close()
+ return BufferAudioSource(data, srate, swidth, ch)
+
+ raise AudioFileFormatError("Cannot read audio file format")
+
+
+def save_audio_data(data, filename, filetype=None, **kwargs):
+
+ lower_fname = filename.lower()
+ if filetype is not None:
+ filetype = filetype.lower()
+
+ # save raw data
+ if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")):
+ fp = open(filename, "w")
+ fp.write(data)
+ fp.close()
+ return
+
+ # save other types of data
+ # requires all audio parameters
+ srate = kwargs.pop("sampling_rate", None)
+ if srate is None:
+ srate = kwargs.pop("sr", None)
+
+ swidth = kwargs.pop("sample_width", None)
+ if swidth is None:
+ swidth = kwargs.pop("sw", None)
+
+ ch = kwargs.pop("channels", None)
+ if ch is None:
+ ch = kwargs.pop("ch", None)
+
+ if None in (swidth, srate, ch):
+ raise Exception("All audio parameters are required to save no raw data")
+
+ if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")):
+ # use standard python's wave module
+ fp = wave.open(filename, "w")
+ fp.setnchannels(ch)
+ fp.setsampwidth(swidth)
+ fp.setframerate(srate)
+ fp.writeframes(data)
+ fp.close()
+
+ elif WITH_PYDUB:
+
+ asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch)
+ asegment.export(filename, format=filetype)
+
+ else:
+ raise AudioFileFormatError("cannot write file format {0} (file name: {1})".format(filetype, filename))
+
+
+def plot_all(signal, sampling_rate, energy_as_amp, detections=[], show=True, save_as=None):
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate )
+ if len(t) > len(signal):
+ t = t[: len(signal) - len(t)]
+
+ for start, end in detections:
+ p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4)
+
+ line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude")
+ plt.plot(t, signal)
+ legend = plt.legend(["Detection threshold"], bbox_to_anchor=(0., 1.02, 1., .102), loc=1, fontsize=16)
+ ax = plt.gca().add_artist(legend)
+
+ plt.xlabel("Time (s)", fontsize=24)
+ plt.ylabel("Amplitude (normalized)", fontsize=24)
+
+ if save_as is not None:
+ plt.savefig(save_as, dpi=120)
+
+ if show:
+ plt.show()
+
+
+def seconds_to_str_fromatter(_format):
+ """
+ Accepted format directives: %i %s %m %h
+ """
+ # check directives are correct
+
+ if _format == "%S":
+ def _fromatter(seconds):
+ return "{:.2f}".format(seconds)
+
+ elif _format == "%I":
+ def _fromatter(seconds):
+ return "{0}".format(int(seconds * 1000))
+
+ else:
+ _format = _format.replace("%h", "{hrs:02d}")
+ _format = _format.replace("%m", "{mins:02d}")
+ _format = _format.replace("%s", "{secs:02d}")
+ _format = _format.replace("%i", "{millis:03d}")
+
+ try:
+ i = _format.index("%")
+ raise TimeFormatError("Unknow time format directive '{0}'".format(_format[i:i+2]))
+ except ValueError:
+ pass
+
+ def _fromatter(seconds):
+ millis = int(seconds * 1000)
+ hrs, millis = divmod(millis, 3600000)
+ mins, millis = divmod(millis, 60000)
+ secs, millis = divmod(millis, 1000)
+ return _format.format(hrs=hrs, mins=mins, secs=secs, millis=millis)
+
+ return _fromatter
+
+
+
+class Worker(Thread):
+
+ def __init__(self, timeout=0.2, debug=False, logger=None):
+ self.timeout = timeout
+ self.debug = debug
+ self.logger = logger
+
+ if self.debug and self.logger is None:
+ self.logger = logging.getLogger(LOGGER_NAME)
+ self.logger.setLevel(logging.DEBUG)
+ handler = logging.StreamHandler(sys.stdout)
+ self.logger.addHandler(handler)
+
+ self._inbox = Queue()
+ self._stop_request = Queue()
+ Thread.__init__(self)
+
+
+ def debug_message(self, message):
+ self.logger.debug(message)
+
+ def _stop_requested(self):
+
+ try:
+ message = self._stop_request.get_nowait()
+ if message == "stop":
+ return True
+
+ except Empty:
+ return False
+
+ def stop(self):
+ self._stop_request.put("stop")
+ self.join()
+
+ def send(self, message):
+ self._inbox.put(message)
+
+ def _get_message(self):
+ try:
+ message = self._inbox.get(timeout=self.timeout)
+ return message
+ except Empty:
+ return None
+
+
+class TokenizerWorker(Worker):
+
+ END_OF_PROCESSING = "END_OF_PROCESSING"
+
+ def __init__(self, ads, tokenizer, analysis_window, observers):
+ self.ads = ads
+ self.tokenizer = tokenizer
+ self.analysis_window = analysis_window
+ self.observers = observers
+ self._inbox = Queue()
+ self.count = 0
+ Worker.__init__(self)
+
+ def run(self):
+
+ def notify_observers(data, start, end):
+ audio_data = b''.join(data)
+ self.count += 1
+
+ start_time = start * self.analysis_window
+ end_time = (end+1) * self.analysis_window
+ duration = (end - start + 1) * self.analysis_window
+
+ # notify observers
+ for observer in self.observers:
+ observer.notify({"id" : self.count,
+ "audio_data" : audio_data,
+ "start" : start,
+ "end" : end,
+ "start_time" : start_time,
+ "end_time" : end_time,
+ "duration" : duration}
+ )
+
+ self.ads.open()
+ self.tokenizer.tokenize(data_source=self, callback=notify_observers)
+ for observer in self.observers:
+ observer.notify(TokenizerWorker.END_OF_PROCESSING)
+
+ def add_observer(self, observer):
+ self.observers.append(observer)
+
+ def remove_observer(self, observer):
+ self.observers.remove(observer)
+
+ def read(self):
+ if self._stop_requested():
+ return None
+ else:
+ return self.ads.read()
+
+
+class PlayerWorker(Worker):
+
+ def __init__(self, player, timeout=0.2, debug=False, logger=None):
+ self.player = player
+ Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
+
+ def run(self):
+ while True:
+ if self._stop_requested():
+ break
+
+ message = self._get_message()
+ if message is not None:
+ if message == TokenizerWorker.END_OF_PROCESSING:
+ break
+
+ audio_data = message.pop("audio_data", None)
+ start_time = message.pop("start_time", None)
+ end_time = message.pop("end_time", None)
+ dur = message.pop("duration", None)
+ _id = message.pop("id", None)
+
+ if audio_data is not None:
+ if self.debug:
+ self.debug_message("[PLAY]: Detection {id} played (start:{start}, end:{end}, dur:{dur})".format(id=_id,
+ start="{:5.2f}".format(start_time), end="{:5.2f}".format(end_time), dur="{:5.2f}".format(dur)))
+ self.player.play(audio_data)
+
+ def notify(self, message):
+ self.send(message)
+
+
+class CommandLineWorker(Worker):
+
+ def __init__(self, command, timeout=0.2, debug=False, logger=None):
+ self.command = command
+ Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
+
+ def run(self):
+ while True:
+ if self._stop_requested():
+ break
+
+ message = self._get_message()
+ if message is not None:
+ if message == TokenizerWorker.END_OF_PROCESSING:
+ break
+
+ audio_data = message.pop("audio_data", None)
+ _id = message.pop("id", None)
+ if audio_data is not None:
+ raw_audio_file = tempfile.NamedTemporaryFile(delete=False)
+ raw_audio_file.write(audio_data)
+ cmd = self.command.replace("$", raw_audio_file.name)
+ if self.debug:
+ self.debug_message("[CMD ]: Detection {id} command: {cmd}".format(id=_id, cmd=cmd))
+ os.system(cmd)
+ os.unlink(raw_audio_file.name)
+
+ def notify(self, message):
+ self.send(message)
+
+
+class TokenSaverWorker(Worker):
+
+ def __init__(self, name_format, filetype, timeout=0.2, debug=False, logger=None, **kwargs):
+ self.name_format = name_format
+ self.filetype = filetype
+ self.kwargs = kwargs
+ Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
+
+ def run(self):
+ while True:
+ if self._stop_requested():
+ break
+
+ message = self._get_message()
+ if message is not None:
+ if message == TokenizerWorker.END_OF_PROCESSING:
+ break
+
+ audio_data = message.pop("audio_data", None)
+ start_time = message.pop("start_time", None)
+ end_time = message.pop("end_time", None)
+ _id = message.pop("id", None)
+ if audio_data is not None and len(audio_data) > 0:
+ fname = self.name_format.format(N=_id, start = "{:.2f}".format(start_time), end = "{:.2f}".format(end_time))
+ try:
+ if self.debug:
+ self.debug_message("[SAVE]: Detection {id} saved as {fname}".format(id=_id, fname=fname))
+ save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs)
+ except Exception as e:
+ sys.stderr.write(str(e) + "\n")
+
+ def notify(self, message):
+ self.send(message)
+
+
+class LogWorker(Worker):
+
+ def __init__(self, print_detections=False, output_format="{start} {end}",
+ time_formatter=seconds_to_str_fromatter("%S"), timeout=0.2, debug=False, logger=None):
+
+ self.print_detections = print_detections
+ self.output_format = output_format
+ self.time_formatter = time_formatter
+ self.detections = []
+ Worker.__init__(self, timeout=timeout, debug=debug, logger=logger)
+
+ def run(self):
+ while True:
+ if self._stop_requested():
+ break
+
+ message = self._get_message()
+
+ if message is not None:
+
+ if message == TokenizerWorker.END_OF_PROCESSING:
+ break
+
+ audio_data = message.pop("audio_data", None)
+ _id = message.pop("id", None)
+ start = message.pop("start", None)
+ end = message.pop("end", None)
+ start_time = message.pop("start_time", None)
+ end_time = message.pop("end_time", None)
+ if audio_data is not None and len(audio_data) > 0:
+
+ if self.debug:
+ self.debug_message("[DET ]: Detection {id} (start:{start}, end:{end})".format(id=_id,
+ start="{:5.2f}".format(start_time),
+ end="{:5.2f}".format(end_time)))
+
+ if self.print_detections:
+ print(self.output_format.format(id = _id,
+ start = self.time_formatter(start_time),
+ end = self.time_formatter(end_time)))
+
+ self.detections.append((_id, start, end, start_time, end_time))
+
+
+ def notify(self, message):
+ self.send(message)
+
def main(argv=None):
+ '''Command line options.'''
+
program_name = os.path.basename(sys.argv[0])
+ program_version = version
+ program_build_date = "%s" % __updated__
+
+ program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
+ #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse
+ program_longdesc = '''''' # optional - give further explanation about what the program does
+ program_license = "Copyright 2015 Mohamed El Amine SEHILI \
+ Licensed under the General Public License (GPL) Version 3 \nhttp://www.gnu.org/licenses/"
+
if argv is None:
argv = sys.argv[1:]
try:
- parser = ArgumentParser(
- prog=program_name, description="An Audio Tokenization tool"
- )
- parser.add_argument(
- "--version", "-v", action="version", version=__version__
- )
- group = parser.add_argument_group("Input-Output options")
- group.add_argument(
- dest="input",
- help="Input audio or video file. Use '-' for stdin "
- "[default: read from microphone using pyaudio]",
- metavar="input",
- nargs="?",
- default=None,
- )
- group.add_argument(
- "-I",
- "--input-device-index",
- dest="input_device_index",
- help="Audio device index [default: %(default)s]. "
- "Optional and only effective when using PyAudio",
- type=int,
- default=None,
- metavar="INT",
- )
- group.add_argument(
- "-F",
- "--audio-frame-per-buffer",
- dest="frame_per_buffer",
- help="Audio frame per buffer [default: %(default)s]. "
- "Optional and only effective when using PyAudio",
- type=int,
- default=1024,
- metavar="INT",
- )
- group.add_argument(
- "-f",
- "--input-format",
- dest="input_format",
- type=str,
- default=None,
- help="Input audio file format. If not given, guess format from "
- "extension. If output file name has no extension, guess format "
- "from file header (requires pydub). If none of the previous is "
- "true, raise an error",
- metavar="STRING",
- )
- group.add_argument(
- "-M",
- "--max-read",
- dest="max_read",
- type=float,
- default=None,
- help="Maximum data (in seconds) to read from microphone or file "
- "[default: read until the end of file/stream]",
- metavar="FLOAT",
- )
- group.add_argument(
- "-L",
- "--large-file",
- dest="large_file",
- action="store_true",
- default=False,
- help="Whether input file should be treated as a large file. "
- "If True, data will be read from file on demand, otherwise all "
- "audio data is loaded to memory before tokenization.",
- )
- group.add_argument(
- "-O",
- "--save-stream",
- dest="save_stream",
- type=str,
- default=None,
- help="Save acquired audio data (from file or microphone) to disk."
- " If omitted no data will be saved. [default: omitted]",
- metavar="FILE",
- )
- group.add_argument(
- "-o",
- "--save-detections-as",
- dest="save_detections_as",
- type=str,
- default=None,
- help="File name format for detections."
- "The following placeholders can be used to build output file name "
- "for each detection: {id} (sequential, starts from 1), {start}, "
- "{end} and {duration}. Time placeholders are in seconds. "
- "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
- metavar="STRING",
- )
- group.add_argument(
- "-T",
- "--output-format",
- dest="output_format",
- type=str,
- default=None,
- help="Audio format used to save detections and/or main stream. "
- "If not supplied, then it will: (1. be guessed from extension or "
- "(2. use raw format",
- metavar="STRING",
- )
- group.add_argument(
- "-u",
- "--use-channel",
- dest="use_channel",
- type=str,
- default=None,
- help="Which channel to use for tokenization when input stream is "
- "multi-channel (0 is the first channel). Default is None, meaning "
- "that all channels will be considered for tokenization (i.e., get "
- "any valid audio event regardless of the channel it occurs in). "
- "This value can also be 'mix' (alias 'avg' or 'average') and "
- "means mix down all audio channels into one channel (i.e. compute "
- "average channel) and use the resulting channel for tokenization. "
- "Whatever option is used, saved audio events will contain the same"
- " number of channels as input stream. "
- "[Default: None, use all channels]",
- metavar="INT/STRING",
- )
-
- group = parser.add_argument_group(
- "Tokenization options", "Set tokenizer options."
- )
- group.add_argument(
- "-a",
- "--analysis-window",
- dest="analysis_window",
- default=0.01,
- type=float,
- help="Size of analysis window in seconds [default: %(default)s "
- "(10ms)]",
- metavar="FLOAT",
- )
- group.add_argument(
- "-n",
- "--min-duration",
- dest="min_duration",
- type=float,
- default=0.2,
- help="Min duration of a valid audio event in seconds "
- "[default: %(default)s]",
- metavar="FLOAT",
- )
- group.add_argument(
- "-m",
- "--max-duration",
- dest="max_duration",
- type=float,
- default=5,
- help="Max duration of a valid audio event in seconds "
- "[default: %(default)s]",
- metavar="FLOAT",
- )
- group.add_argument(
- "-s",
- "--max-silence",
- dest="max_silence",
- type=float,
- default=0.3,
- help="Max duration of a consecutive silence within a valid audio "
- "event in seconds [default: %(default)s]",
- metavar="FLOAT",
- )
- group.add_argument(
- "-d",
- "--drop-trailing-silence",
- dest="drop_trailing_silence",
- action="store_true",
- default=False,
- help="Drop trailing silence from a detection [default: keep "
- "trailing silence]",
- )
- group.add_argument(
- "-R",
- "--strict-min-duration",
- dest="strict_min_duration",
- action="store_true",
- default=False,
- help="Reject an event shorter than --min-duration even if it's "
- "adjacent to the latest valid event that reached max-duration "
- "[default: keep such events]",
- )
- group.add_argument(
- "-e",
- "--energy-threshold",
- dest="energy_threshold",
- type=float,
- default=50,
- help="Log energy threshold for detection [default: %(default)s]",
- metavar="FLOAT",
- )
-
- group = parser.add_argument_group(
- "Audio parameters",
- "Define audio parameters if data is read from a "
- "headerless file (raw or stdin) or you want to use "
- "different microphone parameters.",
- )
- group.add_argument(
- "-r",
- "--rate",
- dest="sampling_rate",
- type=int,
- default=16000,
- help="Sampling rate of audio data [default: %(default)s]",
- metavar="INT",
- )
- group.add_argument(
- "-c",
- "--channels",
- dest="channels",
- type=int,
- default=1,
- help="Number of channels of audio data [default: %(default)s]",
- metavar="INT",
- )
- group.add_argument(
- "-w",
- "--width",
- dest="sample_width",
- type=int,
- default=2,
- help="Number of bytes per audio sample [default: %(default)s]",
- metavar="INT",
- )
-
- group = parser.add_argument_group(
- "Do something with audio events",
- "Use these options to print, play back or plot detections.",
- )
- group.add_argument(
- "-C",
- "--command",
- dest="command",
- type=str,
- help="Command to call when an audio detection occurs. Use '{file}' "
- "as a placeholder for the temporary wav file that will contain "
- "event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
- " or \"-C 'play -q {file}'\" to play audio with sox)",
- metavar="STRING",
- )
- group.add_argument(
- "-E",
- "--echo",
- dest="echo",
- action="store_true",
- default=False,
- help="Play back each detection immediately using pyaudio",
- )
- group.add_argument(
- "-B",
- "--progress-bar",
- dest="progress_bar",
- action="store_true",
- default=False,
- help="Show a progress bar when playing audio",
- )
- group.add_argument(
- "-p",
- "--plot",
- dest="plot",
- action="store_true",
- default=False,
- help="Plot and show audio signal and detections (requires "
- "matplotlib)",
- )
- group.add_argument(
- "--save-image",
- dest="save_image",
- type=str,
- help="Save plotted audio signal and detections as a picture or a "
- "PDF file (requires matplotlib)",
- metavar="FILE",
- )
- group.add_argument(
- "--printf",
- dest="printf",
- type=str,
- default="{id} {start} {end}",
- help="Print audio events information, one per line, using this "
- "format. Format can contain text with the following placeholders: "
- "{id} (sequential, starts from 1), {start}, {end}, {duration} and "
- "{timestamp}. The first 3 time placeholders are in seconds and "
- "their format can be set using --time-format argument. "
- "{timestamp} is the system timestamp (date and time) of the event "
- "and can be set using --timestamp-format argument.\n"
- "Example: '[{id}]: {start} -> {end} -- {timestamp}'",
- metavar="STRING",
- )
- group.add_argument(
- "--time-format",
- dest="time_format",
- type=str,
- default="%S",
- help="Format used to print {start}, {end} and {duration} "
- "placeholders used with --printf [default= %(default)s]. The "
- "following formats are accepted:\n"
- "%%S: absolute time in seconds. %%I: absolute time in ms. If at "
- "least one of (%%h, %%m, %%s, %%i) is used, convert time into "
- "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
- "supplied fields are printed. Note that %%S and %%I can only be "
- "used alone",
- metavar="STRING",
- )
- group.add_argument(
- "--timestamp-format",
- dest="timestamp_format",
- type=str,
- default="%Y/%m/%d %H:%M:%S",
- help="Format used to print {timestamp}. Should be a format "
- "accepted by 'datetime' standard module. Default: "
- "'%%Y/%%m/%%d %%H:%%M:%%S'",
- )
- parser.add_argument(
- "-q",
- "--quiet",
- dest="quiet",
- action="store_true",
- default=False,
- help="Do not print any information about detections [default: "
- "print 'id', 'start' and 'end' of each detection]",
- )
- parser.add_argument(
- "-D",
- "--debug",
- dest="debug",
- action="store_true",
- default=False,
- help="Print processing operations to STDOUT",
- )
- parser.add_argument(
- "--debug-file",
- dest="debug_file",
- type=str,
- default=None,
- help="Print processing operations to FILE",
- metavar="FILE",
- )
-
- args = parser.parse_args(argv)
- logger = make_logger(args.debug, args.debug_file)
- kwargs = make_kwargs(args)
- reader, observers = initialize_workers(
- logger=logger, **kwargs.io, **kwargs.miscellaneous
- )
- tokenizer_worker = workers.TokenizerWorker(
- reader, observers, logger=logger, **kwargs.split
- )
- tokenizer_worker.start_all()
+ # setup option parser
+ parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
+
+ group = OptionGroup(parser, "[Input-Output options]")
+ group.add_option("-i", "--input", dest="input", help="Input audio or video file. Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE")
+ group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String")
+ group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT")
+ group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE")
+ group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING")
+ group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING")
+ group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 1st or left channel)]", type=str, default="1", metavar="STRING")
+ parser.add_option_group(group)
+
+
+ group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.")
+ group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT")
+ group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT")
+ group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT")
+ group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT")
+ group.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: keep trailing silence]", action="store_true", default=False)
+ group.add_option("-e", "--energy-threshold", dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=50, metavar="FLOAT")
+ parser.add_option_group(group)
+
+
+ group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.")
+ group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT")
+ group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT")
+ group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, metavar="INT")
+ parser.add_option_group(group)
+
+ group = OptionGroup(parser, "[Do something with detections]", "Use these options to print, play or plot detections.")
+ group.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command (e.g. -C 'du -h $')", default=None, type=str, metavar="STRING")
+ group.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False)
+ group.add_option("-p", "--plot", dest="plot", help="Plot and show audio signal and detections (requires matplotlib)", action="store_true", default=False)
+ group.add_option("", "--save-image", dest="save_image", help="Save plotted audio signal and detections as a picture or a PDF file (requires matplotlib)", type=str, default=None, metavar="FILE")
+ group.add_option("", "--printf", dest="printf", help="print detections one per line using a user supplied format (e.g. '[{id}]: {start} -- {end}'). Available keywords {id}, {start} and {end}", type=str, default="{id} {start} {end}", metavar="STRING")
+ group.add_option("", "--time-format", dest="time_format", help="format used to print {start} and {end}. [Default= %default]. %S: absolute time in sec. %I: absolute time in ms. If at least one of (%h, %m, %s, %i) is used, convert time into hours, minutes, seconds and millis (e.g. %h:%m:%s.%i). Only required fields are printed", type=str, default="%S", metavar="STRING")
+ parser.add_option_group(group)
+
+ parser.add_option("-q", "--quiet", dest="quiet", help="Do not print any information about detections [default: print 'id', 'start' and 'end' of each detection]", action="store_true", default=False)
+ parser.add_option("-D", "--debug", dest="debug", help="Print processing operations to STDOUT", action="store_true", default=False)
+ parser.add_option("", "--debug-file", dest="debug_file", help="Print processing operations to FILE", type=str, default=None, metavar="FILE")
+
+
+ # process options
+ (opts, args) = parser.parse_args(argv)
+
+ if opts.input == "-":
+ asource = StdinAudioSource(sampling_rate = opts.sampling_rate,
+ sample_width = opts.sample_width,
+ channels = opts.channels)
+ #read data from a file
+ elif opts.input is not None:
+ asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel)
+
+ # read data from microphone via pyaudio
+ else:
+ try:
+ asource = PyAudioSource(sampling_rate = opts.sampling_rate,
+ sample_width = opts.sample_width,
+ channels = opts.channels)
+ except Exception:
+ sys.stderr.write("Cannot read data from audio device!\n")
+ sys.stderr.write("You should either install pyaudio or read data from STDIN\n")
+ sys.exit(2)
+
+ logger = logging.getLogger(LOGGER_NAME)
+ logger.setLevel(logging.DEBUG)
+
+ handler = logging.StreamHandler(sys.stdout)
+ if opts.quiet or not opts.debug:
+ # only critical messages will be printed
+ handler.setLevel(logging.CRITICAL)
+ else:
+ handler.setLevel(logging.DEBUG)
+
+ logger.addHandler(handler)
+
+ if opts.debug_file is not None:
+ logger.setLevel(logging.DEBUG)
+ opts.debug = True
+ handler = logging.FileHandler(opts.debug_file, "w")
+ fmt = logging.Formatter('[%(asctime)s] | %(message)s')
+ handler.setFormatter(fmt)
+ handler.setLevel(logging.DEBUG)
+ logger.addHandler(handler)
+
+ record = opts.output_main is not None or opts.plot or opts.save_image is not None
+
+ ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record)
+ validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold)
+
+
+ if opts.drop_trailing_silence:
+ mode = StreamTokenizer.DROP_TRAILING_SILENCE
+ else:
+ mode = 0
+
+ analysis_window_per_second = 1. / opts.analysis_window
+ tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * analysis_window_per_second,
+ max_length=int(opts.max_duration * analysis_window_per_second),
+ max_continuous_silence=opts.max_silence * analysis_window_per_second,
+ mode = mode)
+
+
+ observers = []
+ tokenizer_worker = None
+
+ if opts.output_tokens is not None:
+
+ try:
+ # check user format is correct
+ fname = opts.output_tokens.format(N=0, start=0, end=0)
+
+ # find file type for detections
+ tok_type = opts.output_type
+ if tok_type is None:
+ tok_type = os.path.splitext(opts.output_tokens)[1][1:]
+ if tok_type == "":
+ tok_type = "wav"
+
+ token_saver = TokenSaverWorker(name_format=opts.output_tokens, filetype=tok_type,
+ debug=opts.debug, logger=logger, sr=asource.get_sampling_rate(),
+ sw=asource.get_sample_width(),
+ ch=asource.get_channels())
+ observers.append(token_saver)
+
+ except Exception:
+ sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens))
+ sys.exit(2)
+
+ if opts.echo:
+ try:
+ player = player_for(asource)
+ player_worker = PlayerWorker(player=player, debug=opts.debug, logger=logger)
+ observers.append(player_worker)
+ except Exception:
+ sys.stderr.write("Cannot get an audio player!\n")
+ sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n")
+ sys.exit(2)
+
+ if opts.command is not None and len(opts.command) > 0:
+ cmd_worker = CommandLineWorker(command=opts.command, debug=opts.debug, logger=logger)
+ observers.append(cmd_worker)
+
+ if not opts.quiet or opts.plot is not None or opts.save_image is not None:
+ oformat = opts.printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r")
+ converter = seconds_to_str_fromatter(opts.time_format)
+ log_worker = LogWorker(print_detections = not opts.quiet, output_format=oformat,
+ time_formatter=converter, logger=logger, debug=opts.debug)
+ observers.append(log_worker)
+
+ tokenizer_worker = TokenizerWorker(ads, tokenizer, opts.analysis_window, observers)
+
+ def _save_main_stream():
+ # find file type
+ main_type = opts.output_type
+ if main_type is None:
+ main_type = os.path.splitext(opts.output_main)[1][1:]
+ if main_type == "":
+ main_type = "wav"
+ ads.close()
+ ads.rewind()
+ data = ads.get_audio_source().get_data_buffer()
+ if len(data) > 0:
+ save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(),
+ sw = asource.get_sample_width(),
+ ch = asource.get_channels())
+
+ def _plot():
+ import numpy as np
+ ads.close()
+ ads.rewind()
+ data = ads.get_audio_source().get_data_buffer()
+ signal = AudioEnergyValidator._convert(data, asource.get_sample_width())
+ detections = [(det[3] , det[4]) for det in log_worker.detections]
+ max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1
+ energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude
+ plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections, show = opts.plot, save_as = opts.save_image)
+
+
+ # start observer threads
+ for obs in observers:
+ obs.start()
+ # start tokenization thread
+ tokenizer_worker.start()
+
while True:
time.sleep(1)
if len(threading.enumerate()) == 1:
- raise EndOfProcessing
-
- except (KeyboardInterrupt, EndOfProcessing):
+ break
+
+ tokenizer_worker = None
+
+ if opts.output_main is not None:
+ _save_main_stream()
+ if opts.plot or opts.save_image is not None:
+ _plot()
+
+ return 0
+
+ except KeyboardInterrupt:
+
if tokenizer_worker is not None:
- tokenizer_worker.stop_all()
-
- if isinstance(reader, workers.StreamSaverWorker):
- reader.join()
- try:
- reader.save_stream()
- except AudioEncodingWarning as ae_warn:
- print(str(ae_warn), file=sys.stderr)
-
- if args.plot or args.save_image is not None:
- from .plotting import plot
-
- reader.rewind()
- record = AudioRegion(
- reader.data, reader.sr, reader.sw, reader.ch
- )
- detections = (
- (det.start, det.end) for det in tokenizer_worker.detections
- )
- plot(
- record,
- detections=detections,
- energy_threshold=args.energy_threshold,
- show=True,
- save_as=args.save_image,
- )
+ tokenizer_worker.stop()
+ for obs in observers:
+ obs.stop()
+
+ if opts.output_main is not None:
+ _save_main_stream()
+ if opts.plot or opts.save_image is not None:
+ _plot()
+
return 0
+ except Exception as e:
+ sys.stderr.write(program_name + ": " + str(e) + "\n")
+ sys.stderr.write("for help use -h\n")
+
+ return 2
if __name__ == "__main__":
- sys.exit(main(None))
+ if DEBUG:
+ sys.argv.append("-h")
+ if TESTRUN:
+ import doctest
+ doctest.testmod()
+ if PROFILE:
+ import cProfile
+ import pstats
+ profile_filename = 'auditok.auditok_profile.txt'
+ cProfile.run('main()', profile_filename)
+ statsfile = open("profile_stats.txt", "wb")
+ p = pstats.Stats(profile_filename, stream=statsfile)
+ stats = p.strip_dirs().sort_stats('cumulative')
+ stats.print_stats()
+ statsfile.close()
+ sys.exit(0)
+ sys.exit(main())
diff --git a/libs/auditok/cmdline_util.py b/libs/auditok/cmdline_util.py
deleted file mode 100644
index 20e4ac814..000000000
--- a/libs/auditok/cmdline_util.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import sys
-import logging
-from collections import namedtuple
-from . import workers
-from .util import AudioDataSource
-from .io import player_for
-
-_AUDITOK_LOGGER = "AUDITOK_LOGGER"
-KeywordArguments = namedtuple(
- "KeywordArguments", ["io", "split", "miscellaneous"]
-)
-
-
-def make_kwargs(args_ns):
- if args_ns.save_stream is None:
- record = args_ns.plot or (args_ns.save_image is not None)
- else:
- record = False
- try:
- use_channel = int(args_ns.use_channel)
- except (ValueError, TypeError):
- use_channel = args_ns.use_channel
-
- io_kwargs = {
- "input": args_ns.input,
- "audio_format": args_ns.input_format,
- "max_read": args_ns.max_read,
- "block_dur": args_ns.analysis_window,
- "sampling_rate": args_ns.sampling_rate,
- "sample_width": args_ns.sample_width,
- "channels": args_ns.channels,
- "use_channel": use_channel,
- "save_stream": args_ns.save_stream,
- "save_detections_as": args_ns.save_detections_as,
- "export_format": args_ns.output_format,
- "large_file": args_ns.large_file,
- "frames_per_buffer": args_ns.frame_per_buffer,
- "input_device_index": args_ns.input_device_index,
- "record": record,
- }
-
- split_kwargs = {
- "min_dur": args_ns.min_duration,
- "max_dur": args_ns.max_duration,
- "max_silence": args_ns.max_silence,
- "drop_trailing_silence": args_ns.drop_trailing_silence,
- "strict_min_dur": args_ns.strict_min_duration,
- "energy_threshold": args_ns.energy_threshold,
- }
-
- miscellaneous = {
- "echo": args_ns.echo,
- "progress_bar": args_ns.progress_bar,
- "command": args_ns.command,
- "quiet": args_ns.quiet,
- "printf": args_ns.printf,
- "time_format": args_ns.time_format,
- "timestamp_format": args_ns.timestamp_format,
- }
- return KeywordArguments(io_kwargs, split_kwargs, miscellaneous)
-
-
-def make_logger(stderr=False, file=None, name=_AUDITOK_LOGGER):
- if not stderr and file is None:
- return None
- logger = logging.getLogger(name)
- logger.setLevel(logging.INFO)
- if stderr:
- handler = logging.StreamHandler(sys.stderr)
- handler.setLevel(logging.INFO)
- logger.addHandler(handler)
-
- if file is not None:
- handler = logging.FileHandler(file, "w")
- fmt = logging.Formatter("[%(asctime)s] | %(message)s")
- handler.setFormatter(fmt)
- handler.setLevel(logging.INFO)
- logger.addHandler(handler)
- return logger
-
-
-def initialize_workers(logger=None, **kwargs):
- observers = []
- reader = AudioDataSource(source=kwargs["input"], **kwargs)
- if kwargs["save_stream"] is not None:
- reader = workers.StreamSaverWorker(
- reader,
- filename=kwargs["save_stream"],
- export_format=kwargs["export_format"],
- )
- reader.start()
-
- if kwargs["save_detections_as"] is not None:
- worker = workers.RegionSaverWorker(
- kwargs["save_detections_as"],
- kwargs["export_format"],
- logger=logger,
- )
- observers.append(worker)
-
- if kwargs["echo"]:
- player = player_for(reader)
- worker = workers.PlayerWorker(
- player, progress_bar=kwargs["progress_bar"], logger=logger
- )
- observers.append(worker)
-
- if kwargs["command"] is not None:
- worker = workers.CommandLineWorker(
- command=kwargs["command"], logger=logger
- )
- observers.append(worker)
-
- if not kwargs["quiet"]:
- print_format = (
- kwargs["printf"]
- .replace("\\n", "\n")
- .replace("\\t", "\t")
- .replace("\\r", "\r")
- )
- worker = workers.PrintWorker(
- print_format, kwargs["time_format"], kwargs["timestamp_format"]
- )
- observers.append(worker)
-
- return reader, observers
diff --git a/libs/auditok/core.py b/libs/auditok/core.py
index af00dc7af..47441d2b7 100644
--- a/libs/auditok/core.py
+++ b/libs/auditok/core.py
@@ -1,1267 +1,264 @@
"""
-.. autosummary::
- :toctree: generated/
-
- load
- split
- AudioRegion
- StreamTokenizer
-"""
-import os
-import math
-from .util import AudioReader, DataValidator, AudioEnergyValidator
-from .io import check_audio_data, to_file, player_for, get_audio_source
-from .exceptions import TooSamllBlockDuration
-
-try:
- from . import signal_numpy as signal
-except ImportError:
- from . import signal
-
-__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
-
-
-DEFAULT_ANALYSIS_WINDOW = 0.05
-DEFAULT_ENERGY_THRESHOLD = 50
-_EPSILON = 1e-10
-
-
-def load(input, skip=0, max_read=None, **kwargs):
- """Load audio data from a source and return it as an :class:`AudioRegion`.
-
- Parameters
- ----------
- input : None, str, bytes, AudioSource
- source to read audio data from. If `str`, it should be a path to a
- valid audio file. If `bytes`, it is used as raw audio data. If it is
- "-", raw data will be read from stdin. If None, read audio data from
- the microphone using PyAudio. If of type `bytes` or is a path to a
- raw audio file then `sampling_rate`, `sample_width` and `channels`
- parameters (or their alias) are required. If it's an
- :class:`AudioSource` object it's used directly to read data.
- skip : float, default: 0
- amount, in seconds, of audio data to skip from source. If read from
- a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
- max_read : float, default: None
- amount, in seconds, of audio data to read from source. If read from
- microphone, `max_read` should not be None, otherwise a `ValueError` is
- raised.
- audio_format, fmt : str
- type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
- be used if `input` is a string path to an audio file. If not given,
- audio type will be guessed from file name extension or from file
- header.
- sampling_rate, sr : int
- sampling rate of audio data. Required if `input` is a raw audio file,
- a `bytes` object or None (i.e., read from microphone).
- sample_width, sw : int
- number of bytes used to encode one audio sample, typically 1, 2 or 4.
- Required for raw data, see `sampling_rate`.
- channels, ch : int
- number of channels of audio data. Required for raw data, see
- `sampling_rate`.
- large_file : bool, default: False
- If True, AND if `input` is a path to a *wav* of a *raw* audio file
- (and **only** these two formats) then audio file is not fully loaded to
- memory in order to create the region (but the portion of data needed to
- create the region is of course loaded to memory). Set to True if
- `max_read` is significantly smaller then the size of a large audio file
- that shouldn't be entirely loaded to memory.
-
- Returns
- -------
- region: AudioRegion
-
- Raises
- ------
- ValueError
- raised if `input` is None (i.e., read data from microphone) and `skip`
- != 0 or `input` is None `max_read` is None (meaning that when reading
- from the microphone, no data should be skipped, and maximum amount of
- data to read should be explicitly provided).
- """
- return AudioRegion.load(input, skip, max_read, **kwargs)
-
-
-def split(
- input,
- min_dur=0.2,
- max_dur=5,
- max_silence=0.3,
- drop_trailing_silence=False,
- strict_min_dur=False,
- **kwargs
-):
- """
- Split audio data and return a generator of AudioRegions
-
- Parameters
- ----------
- input : str, bytes, AudioSource, AudioReader, AudioRegion or None
- input audio data. If str, it should be a path to an existing audio file.
- "-" is interpreted as standard input. If bytes, input is considered as
- raw audio data. If None, read audio from microphone.
- Every object that is not an `AudioReader` will be transformed into an
- `AudioReader` before processing. If it is an `str` that refers to a raw
- audio file, `bytes` or None, audio parameters should be provided using
- kwargs (i.e., `samplig_rate`, `sample_width` and `channels` or their
- alias).
- If `input` is str then audio format will be guessed from file extension.
- `audio_format` (alias `fmt`) kwarg can also be given to specify audio
- format explicitly. If none of these options is available, rely on
- backend (currently only pydub is supported) to load data.
- min_dur : float, default: 0.2
- minimun duration in seconds of a detected audio event. By using large
- values for `min_dur`, very short audio events (e.g., very short 1-word
- utterances like 'yes' or 'no') can be mis detected. Using very short
- values might result in a high number of short, unuseful audio events.
- max_dur : float, default: 5
- maximum duration in seconds of a detected audio event. If an audio event
- lasts more than `max_dur` it will be truncated. If the continuation of a
- truncated audio event is shorter than `min_dur` then this continuation
- is accepted as a valid audio event if `strict_min_dur` is False.
- Otherwise it is rejected.
- max_silence : float, default: 0.3
- maximum duration of continuous silence within an audio event. There
- might be many silent gaps of this duration within one audio event. If
- the continuous silence happens at the end of the event than it's kept as
- part of the event if `drop_trailing_silence` is False (default).
- drop_trailing_silence : bool, default: False
- Whether to remove trailing silence from detected events. To avoid abrupt
- cuts in speech, trailing silence should be kept, therefore this
- parameter should be False.
- strict_min_dur : bool, default: False
- strict minimum duration. Do not accept an audio event if it is shorter
- than `min_dur` even if it is contiguous to the latest valid event. This
- happens if the the latest detected event had reached `max_dur`.
-
- Other Parameters
- ----------------
- analysis_window, aw : float, default: 0.05 (50 ms)
- duration of analysis window in seconds. A value between 0.01 (10 ms) and
- 0.1 (100 ms) should be good for most use-cases.
- audio_format, fmt : str
- type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
- used if `input` is a string path to an audio file. If not given, audio
- type will be guessed from file name extension or from file header.
- sampling_rate, sr : int
- sampling rate of audio data. Required if `input` is a raw audio file, is
- a bytes object or None (i.e., read from microphone).
- sample_width, sw : int
- number of bytes used to encode one audio sample, typically 1, 2 or 4.
- Required for raw data, see `sampling_rate`.
- channels, ch : int
- number of channels of audio data. Required for raw data, see
- `sampling_rate`.
- use_channel, uc : {None, "mix"} or int
- which channel to use for split if `input` has multiple audio channels.
- Regardless of which channel is used for splitting, returned audio events
- contain data from *all* channels, just as `input`.
- The following values are accepted:
-
- - None (alias "any"): accept audio activity from any channel, even if
- other channels are silent. This is the default behavior.
-
- - "mix" ("avg" or "average"): mix down all channels (i.e. compute
- average channel) and split the resulting channel.
-
- - int (0 <=, > `channels`): use one channel, specified by integer id,
- for split.
-
- large_file : bool, default: False
- If True, AND if `input` is a path to a *wav* of a *raw* audio file
- (and only these two formats) then audio data is lazily loaded to memory
- (i.e., one analysis window a time). Otherwise the whole file is loaded
- to memory before split. Set to True if the size of the file is larger
- than available memory.
- max_read, mr : float, default: None, read until end of stream
- maximum data to read from source in seconds.
- validator, val : callable, DataValidator
- custom data validator. If `None` (default), an `AudioEnergyValidor` is
- used with the given energy threshold. Can be a callable or an instance
- of `DataValidator` that implements `is_valid`. In either case, it'll be
- called with with a window of audio data as the first parameter.
- energy_threshold, eth : float, default: 50
- energy threshold for audio activity detection. Audio regions that have
- enough windows of with a signal energy equal to or above this threshold
- are considered valid audio events. Here we are referring to this amount
- as the energy of the signal but to be more accurate, it is the log
- energy of computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see
- :class:`AudioEnergyValidator` and
- :func:`calculate_energy_single_channel`). If `validator` is given, this
- argument is ignored.
-
- Yields
- ------
- AudioRegion
- a generator of detected :class:`AudioRegion` s.
- """
- if min_dur <= 0:
- raise ValueError("'min_dur' ({}) must be > 0".format(min_dur))
- if max_dur <= 0:
- raise ValueError("'max_dur' ({}) must be > 0".format(max_dur))
- if max_silence < 0:
- raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence))
-
- if isinstance(input, AudioReader):
- source = input
- analysis_window = source.block_dur
- else:
- analysis_window = kwargs.get(
- "analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
- )
- if analysis_window <= 0:
- raise ValueError(
- "'analysis_window' ({}) must be > 0".format(analysis_window)
- )
-
- params = kwargs.copy()
- params["max_read"] = params.get("max_read", params.get("mr"))
- params["audio_format"] = params.get("audio_format", params.get("fmt"))
- if isinstance(input, AudioRegion):
- params["sampling_rate"] = input.sr
- params["sample_width"] = input.sw
- params["channels"] = input.ch
- input = bytes(input)
- try:
- source = AudioReader(input, block_dur=analysis_window, **params)
- except TooSamllBlockDuration as exc:
- err_msg = "Too small 'analysis_windows' ({0}) for sampling rate "
- err_msg += "({1}). Analysis windows should at least be 1/{1} to "
- err_msg += "cover one single data sample"
- raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate))
-
- validator = kwargs.get("validator", kwargs.get("val"))
- if validator is None:
- energy_threshold = kwargs.get(
- "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
- )
- use_channel = kwargs.get("use_channel", kwargs.get("uc"))
- validator = AudioEnergyValidator(
- energy_threshold, source.sw, source.ch, use_channel=use_channel
- )
- mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
- if strict_min_dur:
- mode |= StreamTokenizer.STRICT_MIN_LENGTH
- min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
- max_length = _duration_to_nb_windows(
- max_dur, analysis_window, math.floor, _EPSILON
- )
- max_continuous_silence = _duration_to_nb_windows(
- max_silence, analysis_window, math.floor, _EPSILON
- )
-
- err_msg = "({0} sec.) results in {1} analysis window(s) "
- err_msg += "({1} == {6}({0} / {2})) which is {5} the number "
- err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
- if min_length > max_length:
- err_msg = "'min_dur' " + err_msg
- raise ValueError(
- err_msg.format(
- min_dur,
- min_length,
- analysis_window,
- max_length,
- max_dur,
- "higher than",
- "ceil",
- )
- )
-
- if max_continuous_silence >= max_length:
- err_msg = "'max_silence' " + err_msg
- raise ValueError(
- err_msg.format(
- max_silence,
- max_continuous_silence,
- analysis_window,
- max_length,
- max_dur,
- "higher or equal to",
- "floor",
- )
- )
-
- tokenizer = StreamTokenizer(
- validator, min_length, max_length, max_continuous_silence, mode=mode
- )
- source.open()
- token_gen = tokenizer.tokenize(source, generator=True)
- region_gen = (
- _make_audio_region(
- token[0],
- token[1],
- source.block_dur,
- source.sr,
- source.sw,
- source.ch,
- )
- for token in token_gen
- )
- return region_gen
-
-
-def _duration_to_nb_windows(
- duration, analysis_window, round_fn=round, epsilon=0
-):
- """
- Converts a given duration into a positive integer of analysis windows.
- if `duration / analysis_window` is not an integer, the result will be
- rounded to the closest bigger integer. If `duration == 0`, returns `0`.
- If `duration < analysis_window`, returns 1.
- `duration` and `analysis_window` can be in seconds or milliseconds but
- must be in the same unit.
-
- Parameters
- ----------
- duration : float
- a given duration in seconds or ms.
- analysis_window: float
- size of analysis window, in the same unit as `duration`.
- round_fn : callable
- function called to round the result. Default: `round`.
- epsilon : float
- small value to add to the division result before rounding.
- E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
- `round_fn=math.floor` returns `2` instead of `3`. Adding a small value
- to `0.3 / 0.1` avoids this error.
-
- Returns
- -------
- nb_windows : int
- minimum number of `analysis_window`'s to cover `durartion`. That means
- that `analysis_window * nb_windows >= duration`.
- """
- if duration < 0 or analysis_window <= 0:
- err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
- raise ValueError(err_msg.format(duration, analysis_window))
- if duration == 0:
- return 0
- return int(round_fn(duration / analysis_window + epsilon))
-
-
-def _make_audio_region(
- data_frames,
- start_frame,
- frame_duration,
- sampling_rate,
- sample_width,
- channels,
-):
- """
- Helper function to create an `AudioRegion` from parameters returned by
- tokenization object. It takes care of setting up region `start` and `end`
- in metadata.
-
- Parameters
- ----------
- frame_duration: float
- duration of analysis window in seconds
- start_frame : int
- index of the fisrt analysis window
- samling_rate : int
- sampling rate of audio data
- sample_width : int
- number of bytes of one audio sample
- channels : int
- number of channels of audio data
-
- Returns
- -------
- audio_region : AudioRegion
- AudioRegion whose start time is calculeted as:
- `1000 * start_frame * frame_duration`
- """
- start = start_frame * frame_duration
- data = b"".join(data_frames)
- duration = len(data) / (sampling_rate * sample_width * channels)
- meta = {"start": start, "end": start + duration}
- return AudioRegion(data, sampling_rate, sample_width, channels, meta)
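
Worked example with hypothetical numbers: with 10 ms analysis windows, a token whose first window is frame 230 and whose data is 32000 bytes of 16 kHz, 16-bit mono audio gets `start = 2.3` and `end = 3.3`:

.. code:: python

    start = 230 * 0.01                  # start_frame * frame_duration = 2.3 s
    duration = 32000 / (16000 * 2 * 1)  # len(data) / (sr * sw * ch) = 1.0 s
    meta = {"start": start, "end": start + duration}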
-
-
-def _read_chunks_online(max_read, **kwargs):
- """
- Helper function to read audio data from an online blocking source
- (i.e., microphone). Used to build an `AudioRegion` and can intercept
- KeyboardInterrupt so that reading stops as soon as this exception is
-    raised. This makes building `AudioRegion`s in [i]python sessions and
-    jupyter notebooks more user-friendly.
-
- Parameters
- ----------
- max_read : float
- maximum amount of data to read in seconds.
- kwargs :
- audio parameters (sampling_rate, sample_width and channels).
-
- See also
- --------
- `AudioRegion.build`
- """
- reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
- reader.open()
- data = []
- try:
- while True:
- frame = reader.read()
- if frame is None:
- break
- data.append(frame)
- except KeyboardInterrupt:
- # Stop data acquisition from microphone when pressing
- # Ctrl+C on a [i]python session or a notebook
- pass
- reader.close()
- return (
- b"".join(data),
- reader.sampling_rate,
- reader.sample_width,
- reader.channels,
- )
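
In practice this helper is reached through `AudioRegion.load` with `input=None`; a minimal interactive sketch (assumes a working microphone and `pyaudio`):

.. code:: python

    from auditok import AudioRegion

    # record at most 5 seconds from the default microphone;
    # Ctrl+C stops acquisition early instead of raising
    region = AudioRegion.load(None, max_read=5)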
-
-
-def _read_offline(input, skip=0, max_read=None, **kwargs):
- """
-    Helper function to read audio data from an offline source (i.e., a file). Used to
- build `AudioRegion`s.
-
- Parameters
- ----------
- input : str, bytes
- path to audio file (if str), or a bytes object representing raw audio
- data.
- skip : float, default 0
-        amount of data, in seconds, to skip from the beginning of the audio source.
- max_read : float, default: None
- maximum amount of audio data to read. Default: None, means read until
- end of stream.
- kwargs :
- audio parameters (sampling_rate, sample_width and channels).
-
- See also
- --------
- `AudioRegion.build`
-
- """
- audio_source = get_audio_source(input, **kwargs)
- audio_source.open()
- if skip is not None and skip > 0:
- skip_samples = round(skip * audio_source.sampling_rate)
- audio_source.read(skip_samples)
- if max_read is not None:
- if max_read < 0:
- max_read = None
- else:
- max_read = round(max_read * audio_source.sampling_rate)
- data = audio_source.read(max_read)
- audio_source.close()
- return (
- data,
- audio_source.sampling_rate,
- audio_source.sample_width,
- audio_source.channels,
- )
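
The offline counterpart, again through `AudioRegion.load` (the file name is hypothetical):

.. code:: python

    from auditok import AudioRegion

    # skip the first 2 seconds of the file, then read at most 10 seconds
    region = AudioRegion.load("audio.wav", skip=2, max_read=10)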
-
-
-def _check_convert_index(index, types, err_msg):
- if not isinstance(index, slice) or index.step is not None:
- raise TypeError(err_msg)
- start = index.start if index.start is not None else 0
- stop = index.stop
- for index in (start, stop):
- if index is not None and not isinstance(index, types):
- raise TypeError(err_msg)
- return start, stop
-
-
-class _SecondsView:
- """A class to create a view of `AudioRegion` that can be sliced using
- indices in seconds.
- """
-
- def __init__(self, region):
- self._region = region
-
- def __getitem__(self, index):
- err_msg = "Slicing AudioRegion by seconds requires indices of type "
- err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
- start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
- sr = self._region.sampling_rate
- start_sample = int(start_s * sr)
- stop_sample = None if stop_s is None else round(stop_s * sr)
- return self._region[start_sample:stop_sample]
-
- @property
- def len(self):
- """
- Return region duration in seconds.
- """
- return self._region.duration
-
-
-class _MillisView(_SecondsView):
- """A class to create a view of `AudioRegion` that can be sliced using
- indices in milliseconds.
- """
-
- def __getitem__(self, index):
- err_msg = (
- "Slicing AudioRegion by milliseconds requires indices of type "
- )
-        err_msg += "'int' without a step (e.g. region.ms[500:1500])"
-        start_ms, stop_ms = _check_convert_index(index, (int,), err_msg)
- start_sec = start_ms / 1000
- stop_sec = None if stop_ms is None else stop_ms / 1000
- index = slice(start_sec, stop_sec)
- return super(_MillisView, self).__getitem__(index)
-
- def __len__(self):
- """
- Return region duration in milliseconds.
- """
- return round(self._region.duration * 1000)
-
- @property
- def len(self):
- """
- Return region duration in milliseconds.
- """
- return len(self)
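
Taken together, the two views give three equivalent ways to slice a region; a minimal sketch:

.. code:: python

    from auditok import AudioRegion

    # 2 seconds of silence: 16 kHz, 2 bytes per sample, mono
    region = AudioRegion(b"\x00" * (2 * 16000 * 2), 16000, 2, 1)
    sub = region.sec[0.5:1.5]  # slice by seconds
    sub = region.ms[500:1500]  # same slice, by milliseconds
    sub = region[8000:24000]   # same slice, by samples
    print(region.ms.len)       # 2000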
-
-
-class _AudioRegionMetadata(dict):
- """A class to store `AudioRegion`'s metadata."""
-
- def __getattr__(self, name):
- if name in self:
- return self[name]
- else:
- err_msg = "AudioRegion metadata has no entry '{}'"
- raise AttributeError(err_msg.format(name))
-
- def __setattr__(self, name, value):
- self[name] = value
-
- def __str__(self):
- return "\n".join("{}: {}".format(k, v) for k, v in self.items())
-
- def __repr__(self):
- return str(self)
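
Metadata entries are reachable both as attributes and as dictionary keys:

.. code:: python

    from auditok import AudioRegion

    region = AudioRegion(b"\x00" * 32000, 16000, 2, 1,
                         meta={"start": 0.0, "end": 1.0})
    print(region.meta.start)   # 0.0, attribute-style access
    print(region.meta["end"])  # 1.0, dict-style access
    # region.meta.foo would raise AttributeError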
-
-
-class AudioRegion(object):
- """
- AudioRegion encapsulates raw audio data and provides an interface to
- perform simple operations on it. Use `AudioRegion.load` to build an
- `AudioRegion` from different types of objects.
-
- Parameters
- ----------
- data : bytes
- raw audio data as a bytes object
- sampling_rate : int
- sampling rate of audio data
- sample_width : int
- number of bytes of one audio sample
- channels : int
- number of channels of audio data
- meta : dict, default: None
-        any collection of <key:value> elements used to build metadata for
-        this `AudioRegion`. Metadata can be accessed via `region.meta.key`
-        if `key` is a valid python attribute name, or via `region.meta[key]`
-        if not. Note that the :func:`split` function (or the
-        :meth:`AudioRegion.split` method) returns `AudioRegion`s with
-        ``start`` and ``end`` meta values that indicate the location in
-        seconds of the region in the original audio data.
-
- See also
- --------
- AudioRegion.load
-
- """
-
- def __init__(self, data, sampling_rate, sample_width, channels, meta=None):
- check_audio_data(data, sample_width, channels)
- self._data = data
- self._sampling_rate = sampling_rate
- self._sample_width = sample_width
- self._channels = channels
- self._samples = None
- self.splitp = self.split_and_plot
-
- if meta is not None:
- self._meta = _AudioRegionMetadata(meta)
- else:
- self._meta = None
-
- self._seconds_view = _SecondsView(self)
- self.sec = self.seconds
- self.s = self.seconds
-
- self._millis_view = _MillisView(self)
- self.ms = self.millis
-
- @property
- def meta(self):
- return self._meta
-
- @meta.setter
- def meta(self, new_meta):
- """Meta data of audio region."""
- self._meta = _AudioRegionMetadata(new_meta)
-
- @classmethod
- def load(cls, input, skip=0, max_read=None, **kwargs):
- """
- Create an `AudioRegion` by loading data from `input`. See :func:`load`
-        for a description of parameters.
-
- Returns
- -------
- region: AudioRegion
-
- Raises
- ------
- ValueError
-            raised if `input` is None and either `skip` != 0 or `max_read` is None.
- """
- if input is None:
- if skip > 0:
- raise ValueError(
- "'skip' should be 0 when reading from microphone"
- )
- if max_read is None or max_read < 0:
- raise ValueError(
- "'max_read' should not be None when reading from "
- "microphone"
- )
- data, sampling_rate, sample_width, channels = _read_chunks_online(
- max_read, **kwargs
- )
- else:
- data, sampling_rate, sample_width, channels = _read_offline(
- input, skip=skip, max_read=max_read, **kwargs
- )
-
- return cls(data, sampling_rate, sample_width, channels)
-
- @property
- def seconds(self):
- """
- A view to slice audio region by seconds (using ``region.seconds[start:end]``).
- """
- return self._seconds_view
-
- @property
- def millis(self):
- """A view to slice audio region by milliseconds (using ``region.millis[start:end]``)."""
- return self._millis_view
-
- @property
- def duration(self):
- """
- Returns region duration in seconds.
- """
- return len(self._data) / (
- self.sampling_rate * self.sample_width * self.channels
- )
-
- @property
- def sampling_rate(self):
-        """Sampling rate of audio data."""
- return self._sampling_rate
-
- @property
- def sr(self):
-        """Sampling rate of audio data, alias for `sampling_rate`."""
- return self._sampling_rate
-
- @property
- def sample_width(self):
- """Number of bytes per sample, one channel considered."""
- return self._sample_width
-
- @property
- def sw(self):
-        """Number of bytes per sample, alias for `sample_width`."""
- return self._sample_width
-
- @property
- def channels(self):
- """Number of channels of audio data."""
- return self._channels
-
- @property
- def ch(self):
- """Number of channels of audio data, alias for `channels`."""
- return self._channels
-
- def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
- """
- Play audio region.
-
- Parameters
- ----------
- progress_bar : bool, default: False
- whether to use a progress bar while playing audio. Default: False.
-            `progress_bar` requires `tqdm`; if it is not installed, no progress bar
- will be shown.
-        player : AudioPlayer, default: None
- audio player to use. if None (default), use `player_for()`
- to get a new audio player.
- progress_bar_kwargs : kwargs
- keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
- use `leave=False` to clean up the screen when play finishes).
- """
- if player is None:
- player = player_for(self)
- player.play(
- self._data, progress_bar=progress_bar, **progress_bar_kwargs
- )
-
- def save(self, file, audio_format=None, exists_ok=True, **audio_parameters):
- """
- Save audio region to file.
-
- Parameters
- ----------
- file : str
- path to output audio file. May contain `{duration}` placeholder
-            as well as any placeholder that this region's metadata might
-            contain (e.g., regions returned by `split` contain metadata with
-            `start` and `end` attributes that can be used to build output file
-            name as `{meta.start}` and `{meta.end}`). See examples using
-            placeholders with formatting.
-
- audio_format : str, default: None
- format used to save audio data. If None (default), format is guessed
- from file name's extension. If file name has no extension, audio
- data is saved as a raw (headerless) audio file.
- exists_ok : bool, default: True
- If True, overwrite `file` if a file with the same name exists.
- If False, raise an `IOError` if `file` exists.
- audio_parameters: dict
- any keyword arguments to be passed to audio saving backend.
-
- Returns
- -------
- file: str
-            name of output file with replaced placeholders.
-
-        Raises
-        ------
-        FileExistsError
-            if `file` exists and `exists_ok` is False.
-
-
- Examples
- --------
-        >>> region = AudioRegion(b'\\0' * 2 * 24000,
-        ...                      sampling_rate=16000,
-        ...                      sample_width=2,
-        ...                      channels=1)
-        >>> region.meta.start = 2.25
-        >>> region.meta.end = 2.25 + region.duration
-        >>> region.save('audio_{meta.start}-{meta.end}.wav')
-        'audio_2.25-3.75.wav'
-        >>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav')
-        'region_2.250_1.500.wav'
- """
- if isinstance(file, str):
- file = file.format(duration=self.duration, meta=self.meta)
- if not exists_ok and os.path.exists(file):
- raise FileExistsError("file '{file}' exists".format(file=file))
- to_file(
- self._data,
- file,
- audio_format,
- sr=self.sr,
- sw=self.sw,
- ch=self.ch,
- audio_parameters=audio_parameters,
- )
- return file
-
- def split(
- self,
- min_dur=0.2,
- max_dur=5,
- max_silence=0.3,
- drop_trailing_silence=False,
- strict_min_dur=False,
- **kwargs
- ):
- """Split audio region. See :func:`auditok.split()` for a comprehensive
- description of split parameters.
-        See also :meth:`AudioRegion.split_and_plot`.
- """
- if kwargs.get("max_read", kwargs.get("mr")) is not None:
- warn_msg = "'max_read' (or 'mr') should not be used with "
-            warn_msg += "AudioRegion.split(). You should rather "
- warn_msg += "slice audio region before calling this method"
- raise RuntimeWarning(warn_msg)
- return split(
- self,
- min_dur=min_dur,
- max_dur=max_dur,
- max_silence=max_silence,
- drop_trailing_silence=drop_trailing_silence,
- strict_min_dur=strict_min_dur,
- **kwargs
- )
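
A typical call, assuming `region` is an `AudioRegion` (parameter values are illustrative):

.. code:: python

    for i, event in enumerate(
        region.split(min_dur=0.3, max_dur=4, max_silence=0.2)
    ):
        event.save("event_{}.wav".format(i))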
-
- def plot(
- self,
- scale_signal=True,
- show=True,
- figsize=None,
- save_as=None,
- dpi=120,
- theme="auditok",
- ):
- """Plot audio region, one sub-plot for each channel.
-
- Parameters
- ----------
- scale_signal : bool, default: True
-            If True, scale the signal by subtracting its mean and dividing by its
- standard deviation before plotting.
- show : bool
- whether to show plotted signal right after the call.
- figsize : tuple, default: None
- width and height of the figure to pass to `matplotlib`.
- save_as : str, default None.
- if provided, also save plot to file.
- dpi : int, default: 120
- plot dpi to pass to `matplotlib`.
- theme : str or dict, default: "auditok"
- plot theme to use. Currently only "auditok" theme is implemented. To
-            provide your own theme, see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
- """
- try:
- from auditok.plotting import plot
-
- plot(
- self,
- scale_signal=scale_signal,
- show=show,
- figsize=figsize,
- save_as=save_as,
- dpi=dpi,
- theme=theme,
- )
- except ImportError:
- raise RuntimeWarning("Plotting requires matplotlib")
-
- def split_and_plot(
- self,
- min_dur=0.2,
- max_dur=5,
- max_silence=0.3,
- drop_trailing_silence=False,
- strict_min_dur=False,
- scale_signal=True,
- show=True,
- figsize=None,
- save_as=None,
- dpi=120,
- theme="auditok",
- **kwargs
- ):
- """Split region and plot signal and detections. Alias: :meth:`splitp`.
- See :func:`auditok.split()` for a comprehensive description of split
- parameters. Also see :meth:`plot` for plot parameters.
- """
- try:
- from auditok.plotting import plot
-
- regions = self.split(
- min_dur=min_dur,
- max_dur=max_dur,
- max_silence=max_silence,
- drop_trailing_silence=drop_trailing_silence,
- strict_min_dur=strict_min_dur,
- **kwargs
- )
- regions = list(regions)
- detections = ((reg.meta.start, reg.meta.end) for reg in regions)
- eth = kwargs.get(
- "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
- )
- plot(
- self,
- scale_signal=scale_signal,
- detections=detections,
- energy_threshold=eth,
- show=show,
- figsize=figsize,
- save_as=save_as,
- dpi=dpi,
- theme=theme,
- )
- return regions
- except ImportError:
- raise RuntimeWarning("Plotting requires matplotlib")
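
Usage sketch, assuming `matplotlib` is installed and `region` is an `AudioRegion`:

.. code:: python

    # split, plot the signal with detections, and save the figure
    regions = region.splitp(min_dur=0.3, max_silence=0.2,
                            save_as="detections.png")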
+This module gathers processing (i.e. tokenization) classes.
- def __array__(self):
- return self.samples
-
- @property
- def samples(self):
- """Audio region as arrays of samples, one array per channel."""
- if self._samples is None:
- self._samples = signal.to_array(
- self._data, self.sample_width, self.channels
- )
- return self._samples
-
- def __len__(self):
- """
- Return region length in number of samples.
- """
- return len(self._data) // (self.sample_width * self.channels)
-
- @property
- def len(self):
- """
- Return region length in number of samples.
- """
- return len(self)
-
- def __bytes__(self):
- return self._data
-
- def __str__(self):
- return (
- "AudioRegion(duration={:.3f}, "
- "sampling_rate={}, sample_width={}, channels={})".format(
- self.duration, self.sr, self.sw, self.ch
- )
- )
-
- def __repr__(self):
- return str(self)
-
- def __add__(self, other):
- """
-        Concatenates this region and `other` and returns a new region.
- Both regions must have the same sampling rate, sample width
- and number of channels. If not, raises a `ValueError`.
- """
- if not isinstance(other, AudioRegion):
- raise TypeError(
- "Can only concatenate AudioRegion, "
- 'not "{}"'.format(type(other))
- )
- if other.sr != self.sr:
- raise ValueError(
- "Can only concatenate AudioRegions of the same "
- "sampling rate ({} != {})".format(self.sr, other.sr)
- )
- if other.sw != self.sw:
- raise ValueError(
- "Can only concatenate AudioRegions of the same "
- "sample width ({} != {})".format(self.sw, other.sw)
- )
- if other.ch != self.ch:
- raise ValueError(
- "Can only concatenate AudioRegions of the same "
- "number of channels ({} != {})".format(self.ch, other.ch)
- )
- data = self._data + other._data
- return AudioRegion(data, self.sr, self.sw, self.ch)
-
- def __radd__(self, other):
- """
- Concatenates `other` and this region. `other` should be an
- `AudioRegion` with the same audio parameters as this region
- but can exceptionally be `0` to make it possible to concatenate
- many regions with `sum`.
- """
- if other == 0:
- return self
-        return other + self
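
The `0` special case is what makes `sum` work on a list of regions; a sketch:

.. code:: python

    from auditok import AudioRegion

    second = AudioRegion(b"\x00" * 32000, 16000, 2, 1)  # 1 s of silence
    two = second + second     # plain concatenation
    four = sum([second] * 4)  # starts from 0, handled by __radd__
    assert four.duration == 4.0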
+Class summary
+=============
- def __mul__(self, n):
- if not isinstance(n, int):
- err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
- raise TypeError(err_msg.format(type(n)))
- data = self._data * n
- return AudioRegion(data, self.sr, self.sw, self.ch)
-
- def __rmul__(self, n):
- return self * n
-
- def __truediv__(self, n):
- if not isinstance(n, int) or n <= 0:
- raise TypeError("AudioRegion can only be divided by a positive int")
- samples_per_sub_region, rest = divmod(len(self), n)
- onset = 0
- sub_regions = []
- while onset < len(self):
- offset = 0
- if rest > 0:
- offset = 1
- rest -= 1
- offset += onset + samples_per_sub_region
- sub_regions.append(self[onset:offset])
- onset = offset
- return sub_regions
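
Division spreads any remainder over the leading sub-regions, so no samples are lost; continuing the sketch above:

.. code:: python

    thirds = region / 3  # three sub-regions of (nearly) equal length
    assert sum(len(r) for r in thirds) == len(region)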
-
- def __eq__(self, other):
- if other is self:
- return True
- if not isinstance(other, AudioRegion):
- return False
- return (
- (self._data == other._data)
- and (self.sr == other.sr)
- and (self.sw == other.sw)
- and (self.ch == other.ch)
- )
-
- def __getitem__(self, index):
- err_msg = "Slicing AudioRegion by samples requires indices of type "
-        err_msg += "'int' without a step (e.g. region[1600:3200])"
-        start_sample, stop_sample = _check_convert_index(index, (int,), err_msg)
-
- bytes_per_sample = self.sample_width * self.channels
- len_samples = len(self._data) // bytes_per_sample
+.. autosummary::
- if start_sample < 0:
- start_sample = max(start_sample + len_samples, 0)
- onset = start_sample * bytes_per_sample
+ StreamTokenizer
+"""
- if stop_sample is not None:
- if stop_sample < 0:
- stop_sample = max(stop_sample + len_samples, 0)
-            offset = stop_sample * bytes_per_sample
- else:
- offset = None
+from auditok.util import DataValidator
- data = self._data[onset:offset]
- return AudioRegion(data, self.sr, self.sw, self.ch)
+__all__ = ["StreamTokenizer"]
-class StreamTokenizer:
+class StreamTokenizer():
"""
Class for stream tokenizers. It implements a 4-state automaton scheme
to extract sub-sequences of interest on the fly.
-
- Parameters
- ----------
- validator : callable, DataValidator (must implement `is_valid`)
- called with each data frame read from source. Should take one positional
- argument and return True or False for valid and invalid frames
- respectively.
-
- min_length : int
- Minimum number of frames of a valid token. This includes all
- tolerated non valid frames within the token.
-
- max_length : int
- Maximum number of frames of a valid token. This includes all
- tolerated non valid frames within the token.
-
- max_continuous_silence : int
- Maximum number of consecutive non-valid frames within a token.
- Note that, within a valid token, there may be many tolerated
- *silent* regions that contain each a number of non valid frames up
- to `max_continuous_silence`
-
- init_min : int
- Minimum number of consecutive valid frames that must be
- **initially** gathered before any sequence of non valid frames can
-        be tolerated. This option is not always needed; it can be used to
- drop non-valid tokens as early as possible. **Default = 0** means
- that the option is by default ineffective.
-
- init_max_silence : int
- Maximum number of tolerated consecutive non-valid frames if the
-        number of already gathered valid frames has not yet reached
-        `init_min`. This argument is normally used together with `init_min`.
- **Default = 0**, by default this argument is not taken into
- consideration.
-
- mode : int
- mode can be one of the following:
-
- -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and
- accept a token shorter than `min_length` if it is the continuation
- of the latest delivered token.
-
- -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
- because `max_length` is reached, and token `i+1` is immediately
- adjacent to token `i` (i.e. token `i` ends at frame `k` and token
-        `i+1` starts at frame `k+1`) then accept token `i+1` only if it has
-        a size of at least `min_length`. The default behavior is to accept
-        token `i+1` even if it is shorter than `min_length` (provided that
- the above conditions are fulfilled of course).
-
-        -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
- non-valid frames from a token to be delivered if and only if it
- is not **truncated**. This can be a bit tricky. A token is actually
- delivered if:
-
- - `max_continuous_silence` is reached.
-
- - Its length reaches `max_length`. This is referred to as a
- **truncated** token.
-
- In the current implementation, a `StreamTokenizer`'s decision is only
- based on already seen data and on incoming data. Thus, if a token is
- truncated at a non-valid but tolerated frame (`max_length` is reached
-        but `max_continuous_silence` not yet) any trailing silence will be kept
-        because it can potentially be part of a valid token (if `max_length` was
- bigger). But if `max_continuous_silence` is reached before
- `max_length`, the delivered token will not be considered as truncated
- but a result of *normal* end of detection (i.e. no more valid data).
- In that case the trailing silence can be removed if you use the
- `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
-
- -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`:
-        use both options. That means: first remove trailing silence, then
- check if the token still has a length of at least `min_length`.
-
-
-
-
- Examples
- --------
-
- In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
- accepted although it is shorter than `min_length` (3), because it
- immediately follows the latest delivered token:
-
- >>> from auditok.core import StreamTokenizer
-    >>> from auditok.util import StringDataSource, DataValidator
-
-    >>> class UpperCaseChecker(DataValidator):
-    ...     def is_valid(self, frame):
-    ...         return frame.isupper()
- >>> dsource = StringDataSource("aaaAAAABBbbb")
- >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-    ...                             min_length=3,
-    ...                             max_length=4,
-    ...                             max_continuous_silence=0)
- >>> tokenizer.tokenize(dsource)
- [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
-
-
- The following tokenizer will however reject the 'BB' token:
-
- >>> dsource = StringDataSource("aaaAAAABBbbb")
- >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-    ...                             min_length=3, max_length=4,
-    ...                             max_continuous_silence=0,
-    ...                             mode=StreamTokenizer.STRICT_MIN_LENGTH)
- >>> tokenizer.tokenize(dsource)
- [(['A', 'A', 'A', 'A'], 3, 6)]
-
-
-
- >>> tokenizer = StreamTokenizer(
-    ...     validator=UpperCaseChecker(),
-    ...     min_length=3,
-    ...     max_length=6,
-    ...     max_continuous_silence=3,
-    ...     mode=StreamTokenizer.DROP_TRAILING_SILENCE
-    ... )
- >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
- >>> tokenizer.tokenize(dsource)
- [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
-
-    The first token is delivered with its trailing silence because it is
-    truncated while the second one has its trailing frames removed.
-
- Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
-
- .. code:: python
-
- [
- (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
- (['B', 'B', 'b', 'b', 'b'], 9, 13)
- ]
-
+
+ :Parameters:
+
+ `validator` :
+ instance of `DataValidator` that implements `is_valid` method.
+
+ `min_length` : *(int)*
+ Minimum number of frames of a valid token. This includes all \
+ tolerated non valid frames within the token.
+
+ `max_length` : *(int)*
+ Maximum number of frames of a valid token. This includes all \
+ tolerated non valid frames within the token.
+
+ `max_continuous_silence` : *(int)*
+ Maximum number of consecutive non-valid frames within a token.
+ Note that, within a valid token, there may be many tolerated \
+ *silent* regions that contain each a number of non valid frames up to \
+ `max_continuous_silence`
+
+ `init_min` : *(int, default=0)*
+ Minimum number of consecutive valid frames that must be **initially** \
+ gathered before any sequence of non valid frames can be tolerated. This
+        option is not always needed; it can be used to drop non-valid tokens as
+ early as possible. **Default = 0** means that the option is by default
+ ineffective.
+
+ `init_max_silence` : *(int, default=0)*
+ Maximum number of tolerated consecutive non-valid frames if the \
+        number of already gathered valid frames has not yet reached 'init_min'.
+ This argument is normally used if `init_min` is used. **Default = 0**,
+ by default this argument is not taken into consideration.
+
+ `mode` : *(int, default=0)*
+ `mode` can be:
+
+ 1. `StreamTokenizer.STRICT_MIN_LENGTH`:
+ if token *i* is delivered because `max_length`
+ is reached, and token *i+1* is immediately adjacent to
+ token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
+        at frame *k+1*) then accept token *i+1* only if it has a size of at
+        least `min_length`. The default behavior is to accept token *i+1*
+        even if it is shorter than `min_length` (given that the above conditions
+ are fulfilled of course).
+
+ :Examples:
+
+ In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
+ accepted although it is shorter than `min_length` (3), because it immediately
+ follows the latest delivered token:
+
+ .. code:: python
+
+ from auditok import StreamTokenizer, StringDataSource, DataValidator
+
+ class UpperCaseChecker(DataValidator):
+ def is_valid(self, frame):
+ return frame.isupper()
+
+
+ dsource = StringDataSource("aaaAAAABBbbb")
+ tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+ min_length=3,
+ max_length=4,
+ max_continuous_silence=0)
+
+ tokenizer.tokenize(dsource)
+
+
+ :output:
+
+ .. code:: python
+
+ [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
+
+
+ The following tokenizer will however reject the 'BB' token:
+
+ .. code:: python
+
+ dsource = StringDataSource("aaaAAAABBbbb")
+ tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+ min_length=3, max_length=4,
+ max_continuous_silence=0,
+ mode=StreamTokenizer.STRICT_MIN_LENGTH)
+ tokenizer.tokenize(dsource)
+
+ :output:
+
+ .. code:: python
+
+ [(['A', 'A', 'A', 'A'], 3, 6)]
+
+
+    2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames
+ from a token to be delivered if and only if it is not **truncated**.
+ This can be a bit tricky. A token is actually delivered if:
+
+ - a. `max_continuous_silence` is reached
+
+ :or:
+
+ - b. Its length reaches `max_length`. This is called a **truncated** token
+
+ In the current implementation, a `StreamTokenizer`'s decision is only based on already seen
+ data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
+    frame (`max_length` is reached but `max_continuous_silence` not yet) any trailing
+    silence will be kept because it can potentially be part of a valid token (if `max_length`
+ was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered
+ token will not be considered as truncated but a result of *normal* end of detection
+    (i.e. no more valid data). In that case the trailing silence can be removed if you use
+ the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
+
+ :Example:
+
+ .. code:: python
+
+ tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
+ max_length=6, max_continuous_silence=3,
+ mode=StreamTokenizer.DROP_TRAILING_SILENCE)
+
+ dsource = StringDataSource("aaaAAAaaaBBbbbb")
+ tokenizer.tokenize(dsource)
+
+ :output:
+
+ .. code:: python
+
+ [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
+
+    The first token is delivered with its trailing silence because it is truncated
+    while the second one has its trailing frames removed.
+
+ Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
+
+ .. code:: python
+
+ [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
+
+
+
+ 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
+    use both options. That means: first remove trailing silence, then check if the
+    token still has a length of at least `min_length`.
"""
-
+
+
SILENCE = 0
POSSIBLE_SILENCE = 1
- POSSIBLE_NOISE = 2
+ POSSIBLE_NOISE = 2
NOISE = 3
- NORMAL = 0
+
STRICT_MIN_LENGTH = 2
DROP_TRAILING_SILENCE = 4
-
- def __init__(
- self,
- validator,
- min_length,
- max_length,
- max_continuous_silence,
- init_min=0,
- init_max_silence=0,
- mode=0,
- ):
- if callable(validator):
- self._is_valid = validator
- elif isinstance(validator, DataValidator):
- self._is_valid = validator.is_valid
- else:
- raise TypeError(
- "'validator' must be a callable or an instance of "
- "DataValidator"
- )
-
+ # alias
+ DROP_TAILING_SILENCE = 4
+
+ def __init__(self, validator,
+ min_length, max_length, max_continuous_silence,
+ init_min=0, init_max_silence=0,
+ mode=0):
+
+ if not isinstance(validator, DataValidator):
+ raise TypeError("'validator' must be an instance of 'DataValidator'")
+
if max_length <= 0:
- raise ValueError(
- "'max_length' must be > 0 (value={0})".format(max_length)
- )
-
+ raise ValueError("'max_length' must be > 0 (value={0})".format(max_length))
+
if min_length <= 0 or min_length > max_length:
- err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
- raise ValueError(err_msg.format(min_length))
-
+ raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))
+
if max_continuous_silence >= max_length:
- err_msg = "'max_continuous_silence' must be < 'max_length' "
- err_msg += "(value={0})"
- raise ValueError(err_msg.format(max_continuous_silence))
-
+ raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))
+
if init_min >= max_length:
- raise ValueError(
- "'init_min' must be < 'max_length' (value={0})".format(
-                    init_min
- )
- )
-
+            raise ValueError("'init_min' must be < 'max_length' (value={0})".format(init_min))
+
self.validator = validator
self.min_length = min_length
self.max_length = max_length
self.max_continuous_silence = max_continuous_silence
self.init_min = init_min
self.init_max_silent = init_max_silence
- self._set_mode(mode)
+
+ self._mode = None
+ self.set_mode(mode)
+ self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
+ self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
+
self._deliver = None
self._tokens = None
self._state = None
self._data = None
self._contiguous_token = False
+
self._init_count = 0
self._silence_length = 0
self._start_frame = 0
self._current_frame = 0
-
- def _set_mode(self, mode):
- strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
- strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
- if mode not in [
- StreamTokenizer.NORMAL,
- StreamTokenizer.STRICT_MIN_LENGTH,
- StreamTokenizer.DROP_TRAILING_SILENCE,
- strict_min_and_drop_trailing,
- ]:
+
+ def set_mode(self, mode):
+ """
+ :Parameters:
+
+ `mode` : *(int)*
+ New mode, must be one of:
+
+
+ - `StreamTokenizer.STRICT_MIN_LENGTH`
+
+ - `StreamTokenizer.DROP_TRAILING_SILENCE`
+
+ - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`
+
+ - `0`
+
+ See `StreamTokenizer.__init__` for more information about the mode.
+ """
+
+ if not mode in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE,
+ self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]:
+
raise ValueError("Wrong value for mode")
+
self._mode = mode
self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
- self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
-
+ self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
+
+
+ def get_mode(self):
+ """
+ Return the current mode. To check whether a specific mode is activated use
+ the bitwise 'and' operator `&`. Example:
+
+ .. code:: python
+
+ if mode & self.STRICT_MIN_LENGTH != 0:
+ do_something()
+ """
+ return self._mode
+
def _reinitialize(self):
self._contiguous_token = False
self._data = []
@@ -1269,114 +266,112 @@ class StreamTokenizer:
self._state = self.SILENCE
self._current_frame = -1
self._deliver = self._append_token
-
- def tokenize(self, data_source, callback=None, generator=False):
+
+
+ def tokenize(self, data_source, callback=None):
"""
-        Read data from `data_source`, one frame at a time, and process the read
- frames in order to detect sequences of frames that make up valid
- tokens.
-
+        Read data from `data_source`, one frame at a time, and process the read frames in
+ order to detect sequences of frames that make up valid tokens.
+
:Parameters:
- `data_source` : instance of the :class:`DataSource` class that
- implements a `read` method. 'read' should return a slice of
- signal, i.e. frame (of whatever type as long as it can be
- processed by validator) and None if there is no more signal.
-
+ `data_source` : instance of the :class:`DataSource` class that implements a `read` method.
+ 'read' should return a slice of signal, i.e. frame (of whatever \
+ type as long as it can be processed by validator) and None if \
+ there is no more signal.
+
`callback` : an optional 3-argument function.
- If a `callback` function is given, it will be called each time
- a valid token is found.
-
-
+ If a `callback` function is given, it will be called each time a valid token
+ is found.
+
+
:Returns:
-        A list of tokens if `callback` is None. Each token is a tuple with the
- following elements:
-
+           A list of tokens if `callback` is None. Each token is a tuple with the following elements:
+
            .. code:: python
-
+
(data, start, end)
-
- where `data` is a list of read frames, `start`: index of the first
- frame in the original data and `end` : index of the last frame.
+
+ where `data` is a list of read frames, `start`: index of the first frame in the
+ original data and `end` : index of the last frame.
+
"""
- token_gen = self._iter_tokens(data_source)
- if callback:
- for token in token_gen:
- callback(*token)
- return
- if generator:
- return token_gen
- return list(token_gen)
-
- def _iter_tokens(self, data_source):
+
self._reinitialize()
+
+ if callback is not None:
+ self._deliver = callback
+
while True:
- frame = data_source.read()
- self._current_frame += 1
+ frame = data_source.read()
if frame is None:
- token = self._post_process()
- if token is not None:
- yield token
break
- token = self._process(frame)
- if token is not None:
- yield token
-
- def _process(self, frame): # noqa: C901
-
- frame_is_valid = self._is_valid(frame)
-
+ self._current_frame += 1
+ self._process(frame)
+
+ self._post_process()
+
+ if callback is None:
+ _ret = self._tokens
+ self._tokens = None
+ return _ret
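
A callback-based run of the 0.1.5 API, with the same `UpperCaseChecker` used in the class docstring above:

.. code:: python

    from auditok import StreamTokenizer, StringDataSource, DataValidator

    class UpperCaseChecker(DataValidator):
        def is_valid(self, frame):
            return frame.isupper()

    def print_token(data, start, end):
        print("token %s from frame %d to %d" % ("".join(data), start, end))

    tokenizer = StreamTokenizer(UpperCaseChecker(), min_length=3,
                                max_length=4, max_continuous_silence=0)
    tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"), callback=print_token)
    # token AAAA from frame 3 to 6
    # token BB from frame 7 to 8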
+
+
+ def _process(self, frame):
+
+ frame_is_valid = self.validator.is_valid(frame)
+
if self._state == self.SILENCE:
-
+
if frame_is_valid:
# seems we got a valid frame after a silence
self._init_count = 1
self._silence_length = 0
self._start_frame = self._current_frame
self._data.append(frame)
-
- if self._init_count >= self.init_min:
+
+ if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
- return self._process_end_of_detection(True)
+ self._process_end_of_detection(True)
else:
self._state = self.POSSIBLE_NOISE
-
+
elif self._state == self.POSSIBLE_NOISE:
-
+
if frame_is_valid:
self._silence_length = 0
self._init_count += 1
self._data.append(frame)
- if self._init_count >= self.init_min:
+ if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
- return self._process_end_of_detection(True)
-
- else:
+ self._process_end_of_detection(True)
+
+ else:
self._silence_length += 1
- if (
- self._silence_length > self.init_max_silent
- or len(self._data) + 1 >= self.max_length
- ):
+ if self._silence_length > self.init_max_silent or \
+ len(self._data) + 1 >= self.max_length:
# either init_max_silent or max_length is reached
# before _init_count, back to silence
self._data = []
self._state = self.SILENCE
else:
self._data.append(frame)
-
+
+
elif self._state == self.NOISE:
-
+
if frame_is_valid:
self._data.append(frame)
if len(self._data) >= self.max_length:
- return self._process_end_of_detection(True)
-
- elif self.max_continuous_silence <= 0:
- # max token reached at this frame will _deliver if
- # _contiguous_token and not _strict_min_length
+ self._process_end_of_detection(True)
+
+ elif self.max_continuous_silence <= 0 :
+ # max token reached at this frame will _deliver if _contiguous_token
+ # and not _strict_min_length
+ self._process_end_of_detection()
self._state = self.SILENCE
- return self._process_end_of_detection()
+
else:
# this is the first silent frame following a valid one
# and it is tolerated
@@ -1384,63 +379,61 @@ class StreamTokenizer:
self._data.append(frame)
self._state = self.POSSIBLE_SILENCE
if len(self._data) == self.max_length:
- return self._process_end_of_detection(True)
- # don't reset _silence_length because we still
+ self._process_end_of_detection(True)
+ # don't reset _silence_length because we still
# need to know the total number of silent frames
-
+
+
+
elif self._state == self.POSSIBLE_SILENCE:
-
+
if frame_is_valid:
self._data.append(frame)
self._silence_length = 0
self._state = self.NOISE
if len(self._data) >= self.max_length:
- return self._process_end_of_detection(True)
-
+ self._process_end_of_detection(True)
+
else:
if self._silence_length >= self.max_continuous_silence:
- self._state = self.SILENCE
if self._silence_length < len(self._data):
-                        # _deliver only if gathered frames aren't all silent
- return self._process_end_of_detection()
- self._data = []
+                        # _deliver only if gathered frames aren't all silent
+ self._process_end_of_detection()
+ else:
+ self._data = []
+ self._state = self.SILENCE
self._silence_length = 0
else:
self._data.append(frame)
self._silence_length += 1
if len(self._data) >= self.max_length:
- return self._process_end_of_detection(True)
- # don't reset _silence_length because we still
+ self._process_end_of_detection(True)
+ # don't reset _silence_length because we still
# need to know the total number of silent frames
-
+
+
def _post_process(self):
if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
if len(self._data) > 0 and len(self._data) > self._silence_length:
- return self._process_end_of_detection()
-
+ self._process_end_of_detection()
+
+
def _process_end_of_detection(self, truncated=False):
-
- if (
- not truncated
- and self._drop_trailing_silence
- and self._silence_length > 0
- ):
+
+ if not truncated and self._drop_tailing_silence and self._silence_length > 0:
# happens if max_continuous_silence is reached
# or max_length is reached at a silent frame
- self._data = self._data[0 : -self._silence_length]
-
- if (len(self._data) >= self.min_length) or (
- len(self._data) > 0
- and not self._strict_min_length
- and self._contiguous_token
- ):
-
- start_frame = self._start_frame
- end_frame = self._start_frame + len(self._data) - 1
- data = self._data
- self._data = []
- token = (data, start_frame, end_frame)
-
+ self._data = self._data[0: - self._silence_length]
+
+ if (len(self._data) >= self.min_length) or \
+ (len(self._data) > 0 and \
+ not self._strict_min_length and self._contiguous_token):
+
+
+
+ _end_frame = self._start_frame + len(self._data) - 1
+ self._deliver(self._data, self._start_frame, _end_frame)
+
if truncated:
# next token (if any) will start at _current_frame + 1
self._start_frame = self._current_frame + 1
@@ -1448,11 +441,12 @@ class StreamTokenizer:
self._contiguous_token = True
else:
self._contiguous_token = False
- return token
else:
- self._contiguous_token = False
-
+ self._contiguous_token = False
+
self._data = []
-
+
+
+
def _append_token(self, data, start, end):
self._tokens.append((data, start, end))
diff --git a/libs/auditok/dataset.py b/libs/auditok/dataset.py
index 98dc5d1d4..1a3a7af5c 100644
--- a/libs/auditok/dataset.py
+++ b/libs/auditok/dataset.py
@@ -1,31 +1,19 @@
"""
-This module contains links to audio files that can be used for test purposes.
-
-.. autosummary::
- :toctree: generated/
-
- one_to_six_arabic_16000_mono_bc_noise
- was_der_mensch_saet_mono_44100_lead_trail_silence
+This module contains links to audio files you can use for test purposes.
"""
import os
-__all__ = [
- "one_to_six_arabic_16000_mono_bc_noise",
- "was_der_mensch_saet_mono_44100_lead_trail_silence",
-]
+__all__ = ["one_to_six_arabic_16000_mono_bc_noise", "was_der_mensch_saet_mono_44100_lead_trail_silence"]
_current_dir = os.path.dirname(os.path.realpath(__file__))
one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\
-16000_mono_bc_noise.wav".format(
- cd=_current_dir, sep=os.path.sep
-)
+16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep)
"""A wave file that contains a pronunciation of Arabic numbers from 1 to 6"""
+
was_der_mensch_saet_mono_44100_lead_trail_silence = "{cd}{sep}data{sep}was_\
der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_\
-silence.wav".format(
- cd=_current_dir, sep=os.path.sep
-)
-"""A wave file that contains a sentence with a long leading and trailing silence"""
+silence.wav".format(cd=_current_dir, sep=os.path.sep)
+""" A wave file that contains a sentence between long leading and trailing periods of silence""" \ No newline at end of file
diff --git a/libs/auditok/exceptions.py b/libs/auditok/exceptions.py
index 7bc5054ee..0026a9d89 100644
--- a/libs/auditok/exceptions.py
+++ b/libs/auditok/exceptions.py
@@ -1,41 +1,9 @@
+"""
+November 2015
+@author: Amine SEHILI <[email protected]>
+"""
+
class DuplicateArgument(Exception):
pass
-class TooSamllBlockDuration(ValueError):
- """Raised when block_dur results in a block_size smaller than one sample."""
-
- def __init__(self, message, block_dur, sampling_rate):
- self.block_dur = block_dur
- self.sampling_rate = sampling_rate
- super(TooSamllBlockDuration, self).__init__(message)
-
-
-class TimeFormatError(Exception):
- """Raised when a duration formatting directive is unknown."""
-
-
-class EndOfProcessing(Exception):
- """Raised within command line script's main function to jump to
- postprocessing code."""
-
-
-class AudioIOError(Exception):
- """Raised when a compressed audio file cannot be loaded or when trying
- to read from a not yet open AudioSource"""
-
-
-class AudioParameterError(AudioIOError):
- """Raised when one audio parameter is missing when loading raw data or
- saving data to a format other than raw. Also raised when an audio
- parameter has a wrong value."""
-
-
-class AudioEncodingError(Exception):
- """Raised if audio data can not be encoded in the provided format"""
-
-
-class AudioEncodingWarning(RuntimeWarning):
- """Raised if audio data can not be encoded in the provided format
- but saved as wav.
- """
diff --git a/libs/auditok/io.py b/libs/auditok/io.py
index b5fb61a76..665ab274d 100644
--- a/libs/auditok/io.py
+++ b/libs/auditok/io.py
@@ -1,1021 +1,499 @@
"""
Module for low-level audio input-output operations.
-.. autosummary::
- :toctree: generated/
+Class summary
+=============
- AudioSource
- Rewindable
- BufferAudioSource
- WaveAudioSource
- PyAudioSource
- StdinAudioSource
- PyAudioPlayer
- from_file
- to_file
- player_for
-"""
-import os
-import sys
-import wave
-import warnings
-from abc import ABC, abstractmethod
-from functools import partial
-from .exceptions import AudioIOError, AudioParameterError
+.. autosummary::
-try:
- from pydub import AudioSegment
+ AudioSource
+ Rewindable
+ BufferAudioSource
+ WaveAudioSource
+ PyAudioSource
+ StdinAudioSource
+ PyAudioPlayer
+
- _WITH_PYDUB = True
-except ImportError:
- _WITH_PYDUB = False
+Function summary
+================
-try:
- from tqdm import tqdm as _tqdm
+.. autosummary::
- DEFAULT_BAR_FORMAT_TQDM = "|" + "{bar}" + "|" + "[{elapsed}/{duration}]"
- DEFAULT_NCOLS_TQDM = 30
- DEFAULT_MIN_INTERVAL_TQDM = 0.05
- _WITH_TQDM = True
-except ImportError:
- _WITH_TQDM = False
+ from_file
+ player_for
+"""
+from abc import ABCMeta, abstractmethod
+import wave
+import sys
-__all__ = [
- "AudioSource",
- "Rewindable",
- "BufferAudioSource",
- "RawAudioSource",
- "WaveAudioSource",
- "PyAudioSource",
- "StdinAudioSource",
- "PyAudioPlayer",
- "from_file",
- "to_file",
- "player_for",
-]
+__all__ = ["AudioSource", "Rewindable", "BufferAudioSource", "WaveAudioSource",
+ "PyAudioSource", "StdinAudioSource", "PyAudioPlayer", "from_file", "player_for"]
-DEFAULT_SAMPLING_RATE = 16000
+DEFAULT_SAMPLE_RATE = 16000
DEFAULT_SAMPLE_WIDTH = 2
DEFAULT_NB_CHANNELS = 1
-def check_audio_data(data, sample_width, channels):
- sample_size_bytes = int(sample_width * channels)
- nb_samples = len(data) // sample_size_bytes
- if nb_samples * sample_size_bytes != len(data):
- raise AudioParameterError(
- "The length of audio data must be an integer "
- "multiple of `sample_width * channels`"
- )
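
A short sketch of the check, using the 0.2.0 helpers defined above:

.. code:: python

    from auditok.io import check_audio_data
    from auditok.exceptions import AudioParameterError

    check_audio_data(b"\x00" * 8, sample_width=2, channels=2)  # OK: two 4-byte frames
    try:
        check_audio_data(b"\x00" * 5, sample_width=2, channels=2)
    except AudioParameterError:
        print("5 bytes is not a whole number of 4-byte frames")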
-
-
-def _guess_audio_format(fmt, filename):
- if fmt is None:
- extension = os.path.splitext(filename.lower())[1][1:]
- if extension:
- fmt = extension
- else:
- return None
- fmt = fmt.lower()
- if fmt == "wave":
- fmt = "wav"
- return fmt
-
-
-def _get_audio_parameters(param_dict):
- """
- Get audio parameters from a dictionary of parameters. An audio parameter can
- have a long name or a short name. If the long name is present, the short
- name will be ignored. If neither is present then `AudioParameterError` is
- raised.
-
- Expected parameters are:
-
- - `sampling_rate`, `sr` : int, sampling rate.
-
- - `sample_width`, `sw` : int, sample size in bytes.
-
- - `channels`, `ch` : int, number of channels.
-
- Returns
- -------
- audio_parameters : tuple
- a tuple for audio parameters as (sampling_rate, sample_width, channels).
- """
- err_message = (
- "'{ln}' (or '{sn}') must be a positive integer, found: '{val}'"
- )
- parameters = []
- for (long_name, short_name) in (
- ("sampling_rate", "sr"),
- ("sample_width", "sw"),
- ("channels", "ch"),
- ):
- param = param_dict.get(long_name, param_dict.get(short_name))
- if param is None or not isinstance(param, int) or param <= 0:
- raise AudioParameterError(
- err_message.format(ln=long_name, sn=short_name, val=param)
- )
- parameters.append(param)
- sampling_rate, sample_width, channels = parameters
- return sampling_rate, sample_width, channels
-
-
-class AudioSource(ABC):
- """
+class AudioSource():
+ """
Base class for audio source objects.
-
-    Subclasses should implement methods to open/close an audio stream
+
+    Subclasses should implement methods to open/close an audio stream
and read the desired amount of audio samples.
-
- Parameters
- ----------
- sampling_rate : int
- number of samples per second of audio data.
- sample_width : int
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int
- number of channels of audio data.
- """
-
- def __init__(
- self, sampling_rate, sample_width, channels,
- ):
-
- if sample_width not in (1, 2, 4):
- raise AudioParameterError(
- "Sample width must be one of: 1, 2 or 4 (bytes)"
- )
-
- self._sampling_rate = sampling_rate
- self._sample_width = sample_width
- self._channels = channels
-
+
+ :Parameters:
+
+ `sampling_rate` : int
+ Number of samples per second of audio stream. Default = 16000.
+
+ `sample_width` : int
+ Size in bytes of one audio sample. Possible values : 1, 2, 4.
+ Default = 2.
+
+ `channels` : int
+ Number of channels of audio stream. The current version supports
+ only mono audio streams (i.e. one channel).
+ """
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
+ sample_width = DEFAULT_SAMPLE_WIDTH,
+ channels = DEFAULT_NB_CHANNELS):
+
+ if not sample_width in (1, 2, 4):
+ raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)")
+
+ if channels != 1:
+ raise ValueError("Only mono audio is currently handled")
+
+ self.sampling_rate = sampling_rate
+ self.sample_width = sample_width
+ self.channels = channels
+
@abstractmethod
def is_open(self):
- """Return True if audio source is open, False otherwise."""
-
+ """ Return True if audio source is open, False otherwise """
+
@abstractmethod
def open(self):
- """Open audio source."""
-
+ """ Open audio source """
+
@abstractmethod
def close(self):
- """Close audio source."""
-
+ """ Close audio source """
+
@abstractmethod
def read(self, size):
"""
Read and return `size` audio samples at most.
-
- Parameters
- -----------
- size : int
- Number of samples to read.
-
- Returns
- -------
- data : bytes
- Audio data as a bytes object of length `N * sample_width * channels`
- where `N` equals:
-
- - `size` if `size` <= remaining samples
-
- - remaining samples if `size` > remaining samples
- """
-
- @property
- def sampling_rate(self):
- """Number of samples per second of audio stream."""
- return self._sampling_rate
-
- @property
- def sr(self):
- """Number of samples per second of audio stream (alias for
- `sampling_rate)`."""
- return self._sampling_rate
-
- @property
- def sample_width(self):
- """Number of bytes used to represent one audio sample."""
- return self._sample_width
-
- @property
- def sw(self):
- """Number of bytes used to represent one audio sample (alias for
- `sample_width`)."""
- return self._sample_width
-
- @property
- def channels(self):
- """Number of channels in audio stream."""
- return self._channels
-
- @property
- def ch(self):
- """Number of channels in audio stream (alias for `channels`)."""
+
+ :Parameters:
+
+ `size` : int
+ the number of samples to read.
+
+ :Returns:
+
+            Audio data as a string of length 'N' * 'sample_width' * 'channels', where 'N' is:
+
+            - `size` if `size` <= 'left_samples'
+
+            - 'left_samples' if `size` > 'left_samples'
+
+ """
+
+ def get_sampling_rate(self):
+ """ Return the number of samples per second of audio stream """
+ return self.sampling_rate
+
+ def get_sample_width(self):
+ """ Return the number of bytes used to represent one audio sample """
+ return self.sample_width
+
+ def get_channels(self):
+ """ Return the number of channels of this audio source """
return self.channels
+
-class Rewindable(AudioSource):
+class Rewindable():
"""
Base class for rewindable audio streams.
-
-    Subclasses should implement a method to return to the start of the
- stream (`rewind`), as well as a property getter/setter named `position` that
- reads/sets stream position expressed in number of samples.
+ Subclasses should implement methods to return to the beginning of an
+    audio stream as well as methods to move to an absolute audio position
+ expressed in time or in number of samples.
"""
-
+
+ __metaclass__ = ABCMeta
+
@abstractmethod
def rewind(self):
- """Go back to the beginning of audio stream."""
-
- @property
+ """ Go back to the beginning of audio stream """
+ pass
+
@abstractmethod
- def position(self):
- """Return stream position in number of samples."""
-
- @position.setter
+ def get_position(self):
+ """ Return the total number of already read samples """
+
@abstractmethod
- def position(self, position):
- """Set stream position in number of samples."""
-
- @property
- def position_s(self):
- """Return stream position in seconds."""
- return self.position / self.sampling_rate
-
- @position_s.setter
- def position_s(self, position_s):
- """Set stream position in seconds."""
- self.position = int(self.sampling_rate * position_s)
-
- @property
- def position_ms(self):
- """Return stream position in milliseconds."""
- return (self.position * 1000) // self.sampling_rate
-
- @position_ms.setter
- def position_ms(self, position_ms):
- """Set stream position in milliseconds."""
- if not isinstance(position_ms, int):
- raise ValueError("position_ms should be an int")
- self.position = int(self.sampling_rate * position_ms / 1000)
+ def get_time_position(self):
+ """ Return the total duration in seconds of already read data """
+
+ @abstractmethod
+ def set_position(self, position):
+ """ Move to an absolute position
+
+ :Parameters:
+
+ `position` : int
+ number of samples to skip from the start of the stream
+ """
+
+ @abstractmethod
+ def set_time_position(self, time_position):
+ """ Move to an absolute position expressed in seconds
+
+ :Parameters:
+
+ `time_position` : float
+ seconds to skip from the start of the stream
+ """
+ pass
+
-class BufferAudioSource(Rewindable):
+class BufferAudioSource(AudioSource, Rewindable):
"""
- An `AudioSource` that encapsulates and reads data from a memory buffer.
-
- This class implements the `Rewindable` interface.
- Parameters
- ----------
- data : bytes
- audio data
- sampling_rate : int, default: 16000
- number of samples per second of audio data.
- sample_width : int, default: 2
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int, default: 1
- number of channels of audio data.
+ An :class:`AudioSource` that encapsulates and reads data from a memory buffer.
+ It implements methods from :class:`Rewindable` and is therefore a navigable :class:`AudioSource`.
"""
-
- def __init__(
- self, data, sampling_rate=16000, sample_width=2, channels=1,
- ):
+
+ def __init__(self, data_buffer,
+ sampling_rate = DEFAULT_SAMPLE_RATE,
+ sample_width = DEFAULT_SAMPLE_WIDTH,
+ channels = DEFAULT_NB_CHANNELS):
+
+ if len(data_buffer) % (sample_width * channels) !=0:
+ raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
+
AudioSource.__init__(self, sampling_rate, sample_width, channels)
- check_audio_data(data, sample_width, channels)
- self._data = data
- self._sample_size_all_channels = sample_width * channels
- self._current_position_bytes = 0
+ self._buffer = data_buffer
+ self._index = 0
+ self._left = 0 if self._buffer is None else len(self._buffer)
self._is_open = False
-
+
def is_open(self):
return self._is_open
-
+
def open(self):
self._is_open = True
-
+
def close(self):
self._is_open = False
self.rewind()
-
+
def read(self, size):
if not self._is_open:
- raise AudioIOError("Stream is not open")
- if size is None or size < 0:
- offset = None
- else:
- bytes_to_read = self._sample_size_all_channels * size
- offset = self._current_position_bytes + bytes_to_read
- data = self._data[self._current_position_bytes : offset]
- if data:
- self._current_position_bytes += len(data)
+ raise IOError("Stream is not open")
+
+ if self._left > 0:
+
+ to_read = size * self.sample_width * self.channels
+ if to_read > self._left:
+ to_read = self._left
+
+ data = self._buffer[self._index: self._index + to_read]
+ self._index += to_read
+ self._left -= to_read
+
return data
+
return None
-
- @property
- def data(self):
- """Get raw audio data as a `bytes` object."""
- return self._data
-
+
+ def get_data_buffer(self):
+ """ Return all audio data as one string buffer. """
+ return self._buffer
+
+ def set_data(self, data_buffer):
+ """ Set new data for this audio stream.
+
+ :Parameters:
+
+ `data_buffer` : str, basestring, Bytes
+ a string buffer with a length multiple of (sample_width * channels)
+ """
+ if len(data_buffer) % (self.sample_width * self.channels) !=0:
+ raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
+ self._buffer = data_buffer
+ self._index = 0
+ self._left = 0 if self._buffer is None else len(self._buffer)
+
+ def append_data(self, data_buffer):
+ """ Append data to this audio stream
+
+ :Parameters:
+
+ `data_buffer` : str, basestring, Bytes
+ a buffer with a length multiple of (sample_width * channels)
+ """
+
+ if len(data_buffer) % (self.sample_width * self.channels) !=0:
+ raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)")
+
+ self._buffer += data_buffer
+ self._left += len(data_buffer)
+
+
def rewind(self):
- self.position = 0
-
- @property
- def position(self):
- """Get stream position in number of samples"""
- return self._current_position_bytes // self._sample_size_all_channels
-
- @position.setter
- def position(self, position):
- """Set stream position in number of samples."""
- position *= self._sample_size_all_channels
+ self.set_position(0)
+
+ def get_position(self):
+ return self._index / self.sample_width
+
+ def get_time_position(self):
+ return float(self._index) / (self.sample_width * self.sampling_rate)
+
+ def set_position(self, position):
if position < 0:
- position += len(self.data)
- if position < 0 or position > len(self.data):
- raise IndexError("Position out of range")
- self._current_position_bytes = position
+ raise ValueError("position must be >= 0")
+
+ if self._buffer is None:
+ self._index = 0
+ self._left = 0
+ return
+
+ position *= self.sample_width
+ self._index = position if position < len(self._buffer) else len(self._buffer)
+ self._left = len(self._buffer) - self._index
- @property
- def position_ms(self):
- """Get stream position in milliseconds."""
- return (self._current_position_bytes * 1000) // (
- self._sample_size_all_channels * self.sampling_rate
- )
- @position_ms.setter
- def position_ms(self, position_ms):
- """Set stream position in milliseconds."""
- if not isinstance(position_ms, int):
- raise ValueError("position_ms should be an int")
- self.position = int(self.sampling_rate * position_ms / 1000)
+ def set_time_position(self, time_position): # time in seconds
+ position = int(self.sampling_rate * time_position)
+ self.set_position(position)
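
For orientation, a minimal usage sketch of the 0.1.5 `BufferAudioSource` API reinstated above (assuming the constructor signature `BufferAudioSource(data_buffer, sampling_rate, sample_width, channels)` defined earlier in this file):

.. code:: python

    from auditok.io import BufferAudioSource

    # 8 mono 16-bit samples: length is a multiple of sample_width * channels
    data = b"\x01\x00" * 8
    src = BufferAudioSource(data, sampling_rate=16000, sample_width=2, channels=1)
    src.open()
    block = src.read(4)                  # 4 samples -> 8 bytes
    assert len(block) == 8
    src.set_position(0)                  # back to the first sample
    src.append_data(b"\x02\x00" * 4)     # must also be a multiple of 2 bytes
    src.close()
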
-class FileAudioSource(AudioSource):
- """
-    Base class for `AudioSource`s that read audio data from a file.
- Parameters
- ----------
- sampling_rate : int, default: 16000
- number of samples per second of audio data.
- sample_width : int, default: 2
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int, default: 1
- number of channels of audio data.
+class WaveAudioSource(AudioSource):
"""
-
- def __init__(self, sampling_rate, sample_width, channels):
- AudioSource.__init__(self, sampling_rate, sample_width, channels)
+ A class for an `AudioSource` that reads data from a wave file.
+
+ :Parameters:
+
+ `filename` :
+ path to a valid wave file
+ """
+
+ def __init__(self, filename):
+
+ self._filename = filename
self._audio_stream = None
-
- def __del__(self):
- if self.is_open():
- self.close()
-
+
+ stream = wave.open(self._filename)
+ AudioSource.__init__(self, stream.getframerate(),
+ stream.getsampwidth(),
+ stream.getnchannels())
+ stream.close()
+
+
def is_open(self):
return self._audio_stream is not None
-
+
+ def open(self):
+        if self._audio_stream is None:
+ self._audio_stream = wave.open(self._filename)
+
+
def close(self):
if self._audio_stream is not None:
self._audio_stream.close()
self._audio_stream = None
-
- @abstractmethod
- def _read_from_stream(self, size):
- """Read data from stream"""
-
+
+
def read(self, size):
- if not self.is_open():
- raise AudioIOError("Audio stream is not open")
- data = self._read_from_stream(size)
- if not data:
- return None
- return data
-
-
-class RawAudioSource(FileAudioSource):
- """
- A class for an `AudioSource` that reads data from a raw (headerless) audio
- file.
-
- This class should be used for large raw audio files to avoid loading the
- whole data to memory.
-
- Parameters
- ----------
- filename : str
- path to a raw audio file.
- sampling_rate : int
- Number of samples per second of audio data.
- sample_width : int
- Size in bytes of one audio sample. Possible values : 1, 2, 4.
- channels : int
- Number of channels of audio data.
- """
-
- def __init__(self, file, sampling_rate, sample_width, channels):
- FileAudioSource.__init__(self, sampling_rate, sample_width, channels)
- self._file = file
- self._audio_stream = None
- self._sample_size = sample_width * channels
-
- def open(self):
if self._audio_stream is None:
- self._audio_stream = open(self._file, "rb")
-
- def _read_from_stream(self, size):
- if size is None or size < 0:
- bytes_to_read = None
+ raise IOError("Stream is not open")
else:
- bytes_to_read = size * self._sample_size
- data = self._audio_stream.read(bytes_to_read)
- return data
-
-
-class WaveAudioSource(FileAudioSource):
- """
- A class for an `AudioSource` that reads data from a wave file.
-
- This class should be used for large wave files to avoid loading the whole
- data to memory.
-
- Parameters
- ----------
- filename : str
- path to a valid wave file.
- """
-
- def __init__(self, filename):
- self._filename = filename
- self._audio_stream = None
- stream = wave.open(self._filename, "rb")
- FileAudioSource.__init__(
- self,
- stream.getframerate(),
- stream.getsampwidth(),
- stream.getnchannels(),
- )
- stream.close()
-
- def open(self):
- if self._audio_stream is None:
- self._audio_stream = wave.open(self._filename)
-
- def _read_from_stream(self, size):
- if size is None or size < 0:
- size = -1
- return self._audio_stream.readframes(size)
+ data = self._audio_stream.readframes(size)
+ if data is None or len(data) < 1:
+ return None
+ return data
class PyAudioSource(AudioSource):
"""
- A class for an `AudioSource` that reads data from built-in microphone using
- PyAudio (https://people.csail.mit.edu/hubert/pyaudio/).
-
- Parameters
- ----------
- sampling_rate : int, default: 16000
- number of samples per second of audio data.
- sample_width : int, default: 2
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int, default: 1
- number of channels of audio data.
- frames_per_buffer : int, default: 1024
- PyAudio number of frames per buffer.
- input_device_index: None or int, default: None
- PyAudio index of audio device to read audio data from. If None default
- device is used.
+    A class for an `AudioSource` that reads data from the built-in microphone using PyAudio.
"""
-
- def __init__(
- self,
- sampling_rate=16000,
- sample_width=2,
- channels=1,
- frames_per_buffer=1024,
- input_device_index=None,
- ):
-
+
+ def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
+ sample_width = DEFAULT_SAMPLE_WIDTH,
+ channels = DEFAULT_NB_CHANNELS,
+ frames_per_buffer = 1024):
+
+
AudioSource.__init__(self, sampling_rate, sample_width, channels)
self._chunk_size = frames_per_buffer
- self.input_device_index = input_device_index
-
+
import pyaudio
-
self._pyaudio_object = pyaudio.PyAudio()
- self._pyaudio_format = self._pyaudio_object.get_format_from_width(
- self.sample_width
- )
+ self._pyaudio_format = self._pyaudio_object.get_format_from_width(self.sample_width)
self._audio_stream = None
+
def is_open(self):
return self._audio_stream is not None
-
+
def open(self):
- self._audio_stream = self._pyaudio_object.open(
- format=self._pyaudio_format,
- channels=self.channels,
- rate=self.sampling_rate,
- input=True,
- output=False,
- input_device_index=self.input_device_index,
- frames_per_buffer=self._chunk_size,
- )
-
+ self._audio_stream = self._pyaudio_object.open(format = self._pyaudio_format,
+ channels = self.channels,
+ rate = self.sampling_rate,
+ input = True,
+ output = False,
+ frames_per_buffer = self._chunk_size)
+
+
def close(self):
if self._audio_stream is not None:
self._audio_stream.stop_stream()
self._audio_stream.close()
self._audio_stream = None
-
+
+
def read(self, size):
if self._audio_stream is None:
raise IOError("Stream is not open")
+
if self._audio_stream.is_active():
data = self._audio_stream.read(size)
if data is None or len(data) < 1:
return None
return data
+
return None
+
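
As a usage sketch of the `PyAudioSource` above (assumes the PyAudio package is installed; the parameter values are illustrative):

.. code:: python

    from auditok.io import PyAudioSource

    mic = PyAudioSource(sampling_rate=16000, sample_width=2,
                        channels=1, frames_per_buffer=1024)
    mic.open()
    block = mic.read(1024)   # one buffer of 1024 samples, or None if inactive
    mic.close()
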
-
-class StdinAudioSource(FileAudioSource):
+class StdinAudioSource(AudioSource):
"""
- A class for an `AudioSource` that reads data from standard input.
-
- Parameters
- ----------
- sampling_rate : int, default: 16000
- number of samples per second of audio data.
- sample_width : int, default: 2
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int, default: 1
- number of channels of audio data.
+ A class for an :class:`AudioSource` that reads data from standard input.
"""
-
- def __init__(
- self, sampling_rate=16000, sample_width=2, channels=1,
- ):
- FileAudioSource.__init__(self, sampling_rate, sample_width, channels)
+
+ def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
+ sample_width = DEFAULT_SAMPLE_WIDTH,
+ channels = DEFAULT_NB_CHANNELS):
+
+ AudioSource.__init__(self, sampling_rate, sample_width, channels)
self._is_open = False
- self._sample_size = sample_width * channels
- self._stream = sys.stdin.buffer
-
+
+
def is_open(self):
return self._is_open
-
+
def open(self):
self._is_open = True
-
+
def close(self):
self._is_open = False
-
- def _read_from_stream(self, size):
- bytes_to_read = size * self._sample_size
- data = self._stream.read(bytes_to_read)
- if data:
- return data
- return None
-
-
-def _make_tqdm_progress_bar(iterable, total, duration, **tqdm_kwargs):
- fmt = tqdm_kwargs.get("bar_format", DEFAULT_BAR_FORMAT_TQDM)
- fmt = fmt.replace("{duration}", "{:.3f}".format(duration))
- tqdm_kwargs["bar_format"] = fmt
-
- tqdm_kwargs["ncols"] = tqdm_kwargs.get("ncols", DEFAULT_NCOLS_TQDM)
- tqdm_kwargs["mininterval"] = tqdm_kwargs.get(
- "mininterval", DEFAULT_MIN_INTERVAL_TQDM
- )
- return _tqdm(iterable, total=total, **tqdm_kwargs)
-
-
-class PyAudioPlayer:
+
+ def read(self, size):
+ if not self._is_open:
+ raise IOError("Stream is not open")
+
+ to_read = size * self.sample_width * self.channels
+        stream = getattr(sys.stdin, "buffer", sys.stdin)  # binary stdin on Python 3
+        data = stream.read(to_read)
+
+ if data is None or len(data) < 1:
+ return None
+
+ return data
+
+
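
A usage sketch of the `StdinAudioSource` above; the script and file names are hypothetical, and binary data is expected on standard input:

.. code:: python

    # e.g. invoked as:  cat audio.raw | python detect.py
    from auditok.io import StdinAudioSource

    src = StdinAudioSource(sampling_rate=16000, sample_width=2, channels=1)
    src.open()
    while True:
        block = src.read(160)    # 10 ms windows at 16 kHz
        if block is None:
            break
        # ... process block ...
    src.close()
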
+class PyAudioPlayer():
"""
    A class for audio playback using PyAudio
- (https://people.csail.mit.edu/hubert/pyaudio/).
-
- Parameters
- ----------
- sampling_rate : int, default: 16000
- number of samples per second of audio data.
- sample_width : int, default: 2
- size in bytes of one audio sample. Possible values: 1, 2 or 4.
- channels : int, default: 1
- number of channels of audio data.
"""
-
- def __init__(
- self, sampling_rate=16000, sample_width=2, channels=1,
- ):
- if sample_width not in (1, 2, 4):
- raise ValueError("Sample width in bytes must be one of 1, 2 or 4")
-
+
+ def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
+ sample_width = DEFAULT_SAMPLE_WIDTH,
+ channels = DEFAULT_NB_CHANNELS):
+        if sample_width not in (1, 2, 4):
+ raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)")
+
self.sampling_rate = sampling_rate
self.sample_width = sample_width
self.channels = channels
-
+
import pyaudio
-
self._p = pyaudio.PyAudio()
- self.stream = self._p.open(
- format=self._p.get_format_from_width(self.sample_width),
- channels=self.channels,
- rate=self.sampling_rate,
- input=False,
- output=True,
- )
-
- def play(self, data, progress_bar=False, **progress_bar_kwargs):
- chunk_gen, nb_chunks = self._chunk_data(data)
- if progress_bar and _WITH_TQDM:
- duration = len(data) / (
- self.sampling_rate * self.sample_width * self.channels
- )
- chunk_gen = _make_tqdm_progress_bar(
- chunk_gen,
- total=nb_chunks,
- duration=duration,
- **progress_bar_kwargs
- )
+ self.stream = self._p.open(format = self._p.get_format_from_width(self.sample_width),
+ channels = self.channels, rate = self.sampling_rate,
+ input = False, output = True)
+
+ def play(self, data):
if self.stream.is_stopped():
self.stream.start_stream()
- try:
- for chunk in chunk_gen:
- self.stream.write(chunk)
- except KeyboardInterrupt:
- pass
+
+ for chunk in self._chunk_data(data):
+ self.stream.write(chunk)
+
self.stream.stop_stream()
-
- def stop(self):
+
+ def stop(self):
if not self.stream.is_stopped():
self.stream.stop_stream()
self.stream.close()
self._p.terminate()
-
+
def _chunk_data(self, data):
# make audio chunks of 100 ms to allow interruption (like ctrl+c)
- bytes_1_sec = self.sampling_rate * self.sample_width * self.channels
- chunk_size = bytes_1_sec // 10
- # make sure chunk_size is a multiple of sample_width * channels
- chunk_size -= chunk_size % (self.sample_width * self.channels)
- nb_chunks, rest = divmod(len(data), chunk_size)
- if rest > 0:
- nb_chunks += 1
- chunk_gen = (
- data[i : i + chunk_size] for i in range(0, len(data), chunk_size)
- )
- return chunk_gen, nb_chunks
-
-
-def player_for(source):
- """
- Return an `AudioPlayer` compatible with `source` (i.e., has the same
- sampling rate, sample width and number of channels).
-
- Parameters
- ----------
- source : AudioSource
-        An object that has `sampling_rate`, `sample_width` and `channels`
- attributes.
-
- Returns
- -------
- player : PyAudioPlayer
- An audio player that has the same sampling rate, sample width
- and number of channels as `source`.
- """
- return PyAudioPlayer(
- source.sampling_rate, source.sample_width, source.channels
- )
-
-
-def get_audio_source(input=None, **kwargs):
- """
- Create and return an AudioSource from input.
-
- Parameters
- ----------
- input : str, bytes, "-" or None (default)
- source to read audio data from. If `str`, it should be a path to a valid
- audio file. If `bytes`, it is used as raw audio data. If it is "-",
- raw data will be read from stdin. If None, read audio data from the
- microphone using PyAudio.
- kwargs
- audio parameters used to build the `AudioSource` object. Depending on
-        the nature of `input`, these may be omitted (e.g., when `input` is an
- audio file in a popular audio format such as wav, ogg, flac, etc.) or
- include parameters such as `sampling_rate`, `sample_width`, `channels`
- (or their respective short name versions `sr`, `sw` and `ch`) if `input`
- is a path to a raw (headerless) audio file, a bytes object for raw audio
- data or None (to read data from built-in microphone). See the respective
-        `AudioSource` classes for more information about possible parameters.
-
- Returns
- -------
- source : AudioSource
- audio source created from input parameters
- """
- if input == "-":
- return StdinAudioSource(*_get_audio_parameters(kwargs))
+ chunk_size = int((self.sampling_rate * self.sample_width * self.channels) / 10)
+ start = 0
+ while start < len(data):
+ yield data[start : start + chunk_size]
+ start += chunk_size
+
+
+def from_file(filename):
+ """
+ Create an `AudioSource` object using the audio file specified by `filename`.
+    The appropriate :class:`AudioSource` class is guessed from the file's extension.
+
+ :Parameters:
+
+ `filename` :
+ path to an audio file.
+
+ :Returns:
+
+ an `AudioSource` object that reads data from the given file.
+
+ """
+
+ if filename.lower().endswith(".wav"):
+ return WaveAudioSource(filename)
+
+ raise Exception("Can not create an AudioSource object from '%s'" %(filename))
+
+
+def player_for(audio_source):
+ """
+ Return a :class:`PyAudioPlayer` that can play data from `audio_source`.
+
+ :Parameters:
+
+ `audio_source` :
+ an `AudioSource` object.
+
+ :Returns:
+
+ `PyAudioPlayer` that has the same sampling rate, sample width and number of channels
+ as `audio_source`.
+ """
+
+ return PyAudioPlayer(audio_source.get_sampling_rate(),
+ audio_source.get_sample_width(),
+ audio_source.get_channels())
+
+
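
Combining `from_file` and `player_for` above, a minimal playback sketch (assumes a wave file `audio.wav` and an installed PyAudio; both are illustrative):

.. code:: python

    from auditok.io import from_file, player_for

    src = from_file("audio.wav")     # only .wav is supported by this version
    src.open()
    player = player_for(src)         # same rate / width / channels as src
    data = src.read(src.get_sampling_rate())   # roughly one second of audio
    if data is not None:
        player.play(data)
    player.stop()
    src.close()
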
- if isinstance(input, bytes):
- return BufferAudioSource(input, *_get_audio_parameters(kwargs))
-
- # read data from a file
- if input is not None:
- return from_file(filename=input, **kwargs)
-
- # read data from microphone via pyaudio
- else:
- frames_per_buffer = kwargs.get("frames_per_buffer", 1024)
- input_device_index = kwargs.get("input_device_index")
- return PyAudioSource(
- *_get_audio_parameters(kwargs),
- frames_per_buffer=frames_per_buffer,
- input_device_index=input_device_index
- )
-
-
-def _load_raw(file, sampling_rate, sample_width, channels, large_file=False):
- """
- Load a raw audio file with standard Python. If `large_file` is True, return
- a `RawAudioSource` object that reads data lazily from disk, otherwise load
- all data to memory and return a `BufferAudioSource` object.
-
- Parameters
- ----------
- file : str
- path to a raw audio data file.
- sampling_rate : int
- sampling rate of audio data.
- sample_width : int
- size in bytes of one audio sample.
- channels : int
- number of channels of audio data.
- large_file : bool
- if True, return a `RawAudioSource` otherwise a `BufferAudioSource`
- object.
-
- Returns
- -------
- source : RawAudioSource or BufferAudioSource
- an `AudioSource` that reads data from input file.
- """
- if None in (sampling_rate, sample_width, channels):
- raise AudioParameterError(
- "All audio parameters are required for raw audio files"
- )
-
- if large_file:
- return RawAudioSource(
- file,
- sampling_rate=sampling_rate,
- sample_width=sample_width,
- channels=channels,
- )
-
- with open(file, "rb") as fp:
- data = fp.read()
- return BufferAudioSource(
- data,
- sampling_rate=sampling_rate,
- sample_width=sample_width,
- channels=channels,
- )
-
-
-def _load_wave(file, large_file=False):
- """
- Load a wave audio file with standard Python. If `large_file` is True, return
- a `WaveAudioSource` object that reads data lazily from disk, otherwise load
- all data to memory and return a `BufferAudioSource` object.
-
- Parameters
- ----------
- file : str
- path to a wav audio data file
- large_file : bool
- if True, return a `WaveAudioSource` otherwise a `BufferAudioSource`
- object.
-
- Returns
- -------
- source : WaveAudioSource or BufferAudioSource
- an `AudioSource` that reads data from input file.
- """
- if large_file:
- return WaveAudioSource(file)
- with wave.open(file) as fp:
- channels = fp.getnchannels()
- srate = fp.getframerate()
- swidth = fp.getsampwidth()
- data = fp.readframes(-1)
- return BufferAudioSource(
- data, sampling_rate=srate, sample_width=swidth, channels=channels
- )
-
-
-def _load_with_pydub(file, audio_format=None):
- """
- Open compressed audio or video file using pydub. If a video file
- is passed, its audio track(s) are extracted and loaded.
-
- Parameters
- ----------
- file : str
- path to audio file.
- audio_format : str, default: None
- string, audio/video file format if known (e.g. raw, webm, wav, ogg)
-
- Returns
- -------
- source : BufferAudioSource
- an `AudioSource` that reads data from input file.
- """
- func_dict = {
- "mp3": AudioSegment.from_mp3,
- "ogg": AudioSegment.from_ogg,
- "flv": AudioSegment.from_flv,
- }
- open_function = func_dict.get(audio_format, AudioSegment.from_file)
- segment = open_function(file)
- return BufferAudioSource(
- data=segment.raw_data,
- sampling_rate=segment.frame_rate,
- sample_width=segment.sample_width,
- channels=segment.channels,
- )
-
-
-def from_file(filename, audio_format=None, large_file=False, **kwargs):
- """
- Read audio data from `filename` and return an `AudioSource` object.
- if `audio_format` is None, the appropriate `AudioSource` class is guessed
- from file's extension. `filename` can be a compressed audio or video file.
- This will require installing `pydub` (https://github.com/jiaaro/pydub).
-
- The normal behavior is to load all audio data to memory from which a
- :class:`BufferAudioSource` object is created. This should be convenient
- most of the time unless audio file is very large. In that case, and
-    in order to load audio data in a lazy manner (i.e. read data from disk each
- time :func:`AudioSource.read` is called), `large_file` should be True.
-
- Note that the current implementation supports only wave and raw formats for
- lazy audio loading.
-
- If an audio format is `raw`, the following keyword arguments are required:
-
- - `sampling_rate`, `sr`: int, sampling rate of audio data.
- - `sample_width`, `sw`: int, size in bytes of one audio sample.
- - `channels`, `ch`: int, number of channels of audio data.
-
- See also
- --------
- :func:`to_file`.
-
- Parameters
- ----------
- filename : str
- path to input audio or video file.
- audio_format : str
- audio format used to save data (e.g. raw, webm, wav, ogg).
- large_file : bool, default: False
-        if True, audio data is not fully loaded to memory; instead, windows are
-        read from disk as needed.
-
-
- Other Parameters
- ----------------
- sampling_rate, sr: int
- sampling rate of audio data
- sample_width : int
- sample width (i.e. number of bytes used to represent one audio sample)
- channels : int
- number of channels of audio data
-
- Returns
- -------
- audio_source : AudioSource
- an :class:`AudioSource` object that reads data from input file.
-
- Raises
- ------
- `AudioIOError`
- raised if audio data cannot be read in the given
- format or if `format` is `raw` and one or more audio parameters are missing.
- """
- audio_format = _guess_audio_format(audio_format, filename)
-
- if audio_format == "raw":
- srate, swidth, channels = _get_audio_parameters(kwargs)
- return _load_raw(filename, srate, swidth, channels, large_file)
-
- if audio_format in ["wav", "wave"]:
- return _load_wave(filename, large_file)
- if large_file:
- err_msg = "if 'large_file` is True file format should be raw or wav"
- raise AudioIOError(err_msg)
- if _WITH_PYDUB:
- return _load_with_pydub(filename, audio_format=audio_format)
- else:
- raise AudioIOError(
- "pydub is required for audio formats other than raw or wav"
- )
-
-
-def _save_raw(data, file):
- """
- Saves audio data as a headerless (i.e. raw) file.
- See also :func:`to_file`.
- """
- with open(file, "wb") as fp:
- fp.write(data)
-
-
-def _save_wave(data, file, sampling_rate, sample_width, channels):
- """
- Saves audio data to a wave file.
- See also :func:`to_file`.
- """
- if None in (sampling_rate, sample_width, channels):
- raise AudioParameterError(
- "All audio parameters are required to save wave audio files"
- )
- with wave.open(file, "w") as fp:
- fp.setframerate(sampling_rate)
- fp.setsampwidth(sample_width)
- fp.setnchannels(channels)
- fp.writeframes(data)
-
-
-def _save_with_pydub(
- data, file, audio_format, sampling_rate, sample_width, channels
-):
- """
- Saves audio data with pydub (https://github.com/jiaaro/pydub).
- See also :func:`to_file`.
- """
- segment = AudioSegment(
- data,
- frame_rate=sampling_rate,
- sample_width=sample_width,
- channels=channels,
- )
- with open(file, "wb") as fp:
- segment.export(fp, format=audio_format)
-
-
-def to_file(data, file, audio_format=None, **kwargs):
- """
- Writes audio data to file. If `audio_format` is `None`, output
- audio format will be guessed from extension. If `audio_format`
- is `None` and `file` comes without an extension then audio
- data will be written as a raw audio file.
-
- Parameters
- ----------
- data : bytes-like
- audio data to be written. Can be a `bytes`, `bytearray`,
- `memoryview`, `array` or `numpy.ndarray` object.
- file : str
- path to output audio file.
- audio_format : str
- audio format used to save data (e.g. raw, webm, wav, ogg)
- kwargs: dict
- If an audio format other than `raw` is used, the following keyword
- arguments are required:
-
- - `sampling_rate`, `sr`: int, sampling rate of audio data.
- - `sample_width`, `sw`: int, size in bytes of one audio sample.
- - `channels`, `ch`: int, number of channels of audio data.
-
- Raises
- ------
- `AudioParameterError` if output format is different than raw and one or more
- audio parameters are missing. `AudioIOError` if audio data cannot be written
- in the desired format.
- """
- audio_format = _guess_audio_format(audio_format, file)
- if audio_format in (None, "raw"):
- _save_raw(data, file)
- return
- try:
- sampling_rate, sample_width, channels = _get_audio_parameters(kwargs)
- except AudioParameterError as exc:
- err_message = "All audio parameters are required to save formats "
- "other than raw. Error detail: {}".format(exc)
- raise AudioParameterError(err_message)
- if audio_format in ("wav", "wave"):
- _save_wave(data, file, sampling_rate, sample_width, channels)
- elif _WITH_PYDUB:
- _save_with_pydub(
- data, file, audio_format, sampling_rate, sample_width, channels
- )
- else:
- err_message = "cannot write file format {} (file name: {})"
- raise AudioIOError(err_message.format(audio_format, file))
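
The removed `_save_wave` helper is a thin wrapper over the standard `wave` module; an equivalent standalone sketch (the function name and parameter values are illustrative):

.. code:: python

    import wave

    def save_wave(data, path, sampling_rate=16000, sample_width=2, channels=1):
        # write raw PCM bytes as a wave file, like the removed _save_wave
        with wave.open(path, "wb") as fp:
            fp.setframerate(sampling_rate)
            fp.setsampwidth(sample_width)
            fp.setnchannels(channels)
            fp.writeframes(data)

    save_wave(b"\x00\x00" * 16000, "silence.wav")  # 1 s of 16 kHz silence
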
diff --git a/libs/auditok/plotting.py b/libs/auditok/plotting.py
deleted file mode 100644
index eca5877f4..000000000
--- a/libs/auditok/plotting.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-
-AUDITOK_PLOT_THEME = {
- "figure": {"facecolor": "#482a36", "alpha": 0.2},
- "plot": {"facecolor": "#282a36"},
- "energy_threshold": {
- "color": "#e31f8f",
- "linestyle": "--",
- "linewidth": 1,
- },
- "signal": {"color": "#40d970", "linestyle": "-", "linewidth": 1},
- "detections": {
- "facecolor": "#777777",
- "edgecolor": "#ff8c1a",
- "linewidth": 1,
- "alpha": 0.75,
- },
-}
-
-
-def _make_time_axis(nb_samples, sampling_rate):
- sample_duration = 1 / sampling_rate
- x = np.linspace(0, sample_duration * (nb_samples - 1), nb_samples)
- return x
-
-
-def _plot_line(x, y, theme, xlabel=None, ylabel=None, **kwargs):
- color = theme.get("color", theme.get("c"))
- ls = theme.get("linestyle", theme.get("ls"))
- lw = theme.get("linewidth", theme.get("lw"))
- plt.plot(x, y, c=color, ls=ls, lw=lw, **kwargs)
- plt.xlabel(xlabel, fontsize=8)
- plt.ylabel(ylabel, fontsize=8)
-
-
-def _plot_detections(subplot, detections, theme):
- fc = theme.get("facecolor", theme.get("fc"))
- ec = theme.get("edgecolor", theme.get("ec"))
- ls = theme.get("linestyle", theme.get("ls"))
- lw = theme.get("linewidth", theme.get("lw"))
- alpha = theme.get("alpha")
- for (start, end) in detections:
- subplot.axvspan(start, end, fc=fc, ec=ec, ls=ls, lw=lw, alpha=alpha)
-
-
-def plot(
- audio_region,
- scale_signal=True,
- detections=None,
- energy_threshold=None,
- show=True,
- figsize=None,
- save_as=None,
- dpi=120,
- theme="auditok",
-):
- y = np.asarray(audio_region)
- if len(y.shape) == 1:
- y = y.reshape(1, -1)
- nb_subplots, nb_samples = y.shape
- sampling_rate = audio_region.sampling_rate
- time_axis = _make_time_axis(nb_samples, sampling_rate)
- if energy_threshold is not None:
- eth_log10 = energy_threshold * np.log(10) / 10
- amplitude_threshold = np.sqrt(np.exp(eth_log10))
- else:
- amplitude_threshold = None
- if detections is None:
- detections = []
- else:
- # End of detection corresponds to the end of the last sample but
- # to stay compatible with the time axis of signal plotting we want end
-        # of detection to correspond to the *start* of that last sample.
- detections = [
- (start, end - (1 / sampling_rate)) for (start, end) in detections
- ]
- if theme == "auditok":
- theme = AUDITOK_PLOT_THEME
-
- fig = plt.figure(figsize=figsize, dpi=dpi)
- fig_theme = theme.get("figure", theme.get("fig", {}))
- fig_fc = fig_theme.get("facecolor", fig_theme.get("ffc"))
- fig_alpha = fig_theme.get("alpha", 1)
- fig.patch.set_facecolor(fig_fc)
- fig.patch.set_alpha(fig_alpha)
-
- plot_theme = theme.get("plot", {})
- plot_fc = plot_theme.get("facecolor", plot_theme.get("pfc"))
-
- if nb_subplots > 2 and nb_subplots % 2 == 0:
- nb_rows = nb_subplots // 2
- nb_columns = 2
- else:
- nb_rows = nb_subplots
- nb_columns = 1
-
- for sid, samples in enumerate(y, 1):
- ax = fig.add_subplot(nb_rows, nb_columns, sid)
- ax.set_facecolor(plot_fc)
- if scale_signal:
- std = samples.std()
- if std > 0:
- mean = samples.mean()
- samples = (samples - mean) / std
- max_ = samples.max()
- plt.ylim(-1.5 * max_, 1.5 * max_)
- if amplitude_threshold is not None:
- if scale_signal and std > 0:
- amp_th = (amplitude_threshold - mean) / std
- else:
- amp_th = amplitude_threshold
- eth_theme = theme.get("energy_threshold", theme.get("eth", {}))
- _plot_line(
- [time_axis[0], time_axis[-1]],
- [amp_th] * 2,
- eth_theme,
- label="Detection threshold",
- )
- if sid == 1:
- legend = plt.legend(
- ["Detection threshold"],
- facecolor=fig_fc,
- framealpha=0.1,
- bbox_to_anchor=(0.0, 1.15, 1.0, 0.102),
- loc=2,
- )
- legend = plt.gca().add_artist(legend)
-
- signal_theme = theme.get("signal", {})
- _plot_line(
- time_axis,
- samples,
- signal_theme,
- xlabel="Time (seconds)",
- ylabel="Signal{}".format(" (scaled)" if scale_signal else ""),
- )
- detections_theme = theme.get("detections", {})
- _plot_detections(ax, detections, detections_theme)
- plt.title("Channel {}".format(sid), fontsize=10)
-
- plt.xticks(fontsize=8)
- plt.yticks(fontsize=8)
- plt.tight_layout()
-
- if save_as is not None:
- plt.savefig(save_as, dpi=dpi)
- if show:
- plt.show()
diff --git a/libs/auditok/signal.py b/libs/auditok/signal.py
deleted file mode 100644
index 28a43bccf..000000000
--- a/libs/auditok/signal.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""
-Module for basic audio signal processing and array operations.
-
-.. autosummary::
- :toctree: generated/
-
- to_array
- extract_single_channel
- compute_average_channel
- compute_average_channel_stereo
- separate_channels
- calculate_energy_single_channel
- calculate_energy_multichannel
-"""
-from array import array as array_
-import audioop
-import math
-
-FORMAT = {1: "b", 2: "h", 4: "i"}
-_EPSILON = 1e-10
-
-
-def to_array(data, sample_width, channels):
- """Extract individual channels of audio data and return a list of arrays of
- numeric samples. This will always return a list of `array.array` objects
- (one per channel) even if audio data is mono.
-
- Parameters
- ----------
- data : bytes
- raw audio data.
-    sample_width : int
-        size in bytes of one audio sample (one channel considered).
-    channels : int
-        number of channels of audio data.
-
- Returns
- -------
- samples_arrays : list
- list of arrays of audio samples.
- """
- fmt = FORMAT[sample_width]
- if channels == 1:
- return [array_(fmt, data)]
- return separate_channels(data, fmt, channels)
-
-
-def extract_single_channel(data, fmt, channels, selected):
- samples = array_(fmt, data)
- return samples[selected::channels]
-
-
-def compute_average_channel(data, fmt, channels):
- """
- Compute and return average channel of multi-channel audio data. If the
- number of channels is 2, use :func:`compute_average_channel_stereo` (much
-    faster). This function uses the standard `array` module to convert `bytes` data
- into an array of numeric values.
-
- Parameters
- ----------
- data : bytes
- multi-channel audio data to mix down.
- fmt : str
- format (single character) to pass to `array.array` to convert `data`
- into an array of samples. This should be "b" if audio data's sample width
- is 1, "h" if it's 2 and "i" if it's 4.
- channels : int
- number of channels of audio data.
-
- Returns
- -------
- mono_audio : bytes
- mixed down audio data.
- """
- all_channels = array_(fmt, data)
- mono_channels = [
- array_(fmt, all_channels[ch::channels]) for ch in range(channels)
- ]
- avg_arr = array_(
- fmt,
- (round(sum(samples) / channels) for samples in zip(*mono_channels)),
- )
- return avg_arr
-
-
-def compute_average_channel_stereo(data, sample_width):
- """Compute and return average channel of stereo audio data. This function
- should be used when the number of channels is exactly 2 because in that
-    case we can use the standard `audioop` module, which is *much* faster than calling
- :func:`compute_average_channel`.
-
- Parameters
- ----------
- data : bytes
- 2-channel audio data to mix down.
- sample_width : int
- size in bytes of one audio sample (one channel considered).
-
- Returns
- -------
- mono_audio : bytes
- mixed down audio data.
- """
- fmt = FORMAT[sample_width]
- arr = array_(fmt, audioop.tomono(data, sample_width, 0.5, 0.5))
- return arr
-
-
-def separate_channels(data, fmt, channels):
- """Create a list of arrays of audio samples (`array.array` objects), one for
- each channel.
-
- Parameters
- ----------
- data : bytes
-        multi-channel audio data to split into separate channels.
- fmt : str
- format (single character) to pass to `array.array` to convert `data`
- into an array of samples. This should be "b" if audio data's sample width
- is 1, "h" if it's 2 and "i" if it's 4.
- channels : int
- number of channels of audio data.
-
- Returns
- -------
- channels_arr : list
- list of audio channels, each as a standard `array.array`.
- """
- all_channels = array_(fmt, data)
- mono_channels = [
- array_(fmt, all_channels[ch::channels]) for ch in range(channels)
- ]
- return mono_channels
-
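
What `separate_channels` computes, as a standalone sketch with the standard `array` module (the sample values are illustrative):

.. code:: python

    from array import array

    # de-interleave stereo 16-bit samples, as separate_channels above does
    interleaved = array("h", [10, -10, 20, -20, 30, -30])    # L R L R L R
    mono = [array("h", interleaved[ch::2]) for ch in range(2)]
    assert mono[0] == array("h", [10, 20, 30])      # left channel
    assert mono[1] == array("h", [-10, -20, -30])   # right channel
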
-
-def calculate_energy_single_channel(data, sample_width):
- """Calculate the energy of mono audio data. Energy is computed as:
-
-    .. math:: energy = 20 \log_{10}\left(\sqrt{\frac{1}{N}\sum_{i=1}^{N}{a_i}^2}\right) % # noqa: W605
-
- where `a_i` is the i-th audio sample and `N` is the number of audio samples
- in data.
-
- Parameters
- ----------
- data : bytes
- single-channel audio data.
- sample_width : int
- size in bytes of one audio sample.
-
- Returns
- -------
- energy : float
- energy of audio signal.
- """
- energy_sqrt = max(audioop.rms(data, sample_width), _EPSILON)
- return 20 * math.log10(energy_sqrt)
-
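
A worked instance of the energy formula above (note that `audioop` was removed from the standard library in Python 3.13):

.. code:: python

    import audioop
    import math

    # two 16-bit samples of constant amplitude 1000 -> rms == 1000
    data = (1000).to_bytes(2, "little", signed=True) * 2
    energy = 20 * math.log10(max(audioop.rms(data, 2), 1e-10))
    assert round(energy) == 60    # 20 * log10(1000) == 60 dB
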
-
-def calculate_energy_multichannel(x, sample_width, aggregation_fn=max):
- """Calculate the energy of multi-channel audio data. Energy is calculated
- channel-wise. An aggregation function is applied to the resulting energies
- (default: `max`). Also see :func:`calculate_energy_single_channel`.
-
- Parameters
- ----------
-    x : iterable
-        iterable of the channels' audio data (one bytes object per channel).
- sample_width : int
- size in bytes of one audio sample (one channel considered).
- aggregation_fn : callable, default: max
- aggregation function to apply to the resulting per-channel energies.
-
- Returns
- -------
- energy : float
- aggregated energy of multi-channel audio signal.
- """
- energies = (calculate_energy_single_channel(xi, sample_width) for xi in x)
- return aggregation_fn(energies)
diff --git a/libs/auditok/signal_numpy.py b/libs/auditok/signal_numpy.py
deleted file mode 100644
index 385e333b1..000000000
--- a/libs/auditok/signal_numpy.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy as np
-from .signal import (
- compute_average_channel_stereo,
- calculate_energy_single_channel,
- calculate_energy_multichannel,
-)
-
-FORMAT = {1: np.int8, 2: np.int16, 4: np.int32}
-
-
-def to_array(data, sample_width, channels):
- fmt = FORMAT[sample_width]
- if channels == 1:
- return np.frombuffer(data, dtype=fmt).astype(np.float64)
- return separate_channels(data, fmt, channels).astype(np.float64)
-
-
-def extract_single_channel(data, fmt, channels, selected):
- samples = np.frombuffer(data, dtype=fmt)
- return np.asanyarray(samples[selected::channels], order="C")
-
-
-def compute_average_channel(data, fmt, channels):
- array = np.frombuffer(data, dtype=fmt).astype(np.float64)
- return array.reshape(-1, channels).mean(axis=1).round().astype(fmt)
-
-
-def separate_channels(data, fmt, channels):
- array = np.frombuffer(data, dtype=fmt)
- return np.asanyarray(array.reshape(-1, channels).T, order="C")
diff --git a/libs/auditok/util.py b/libs/auditok/util.py
index f29eb9bf3..d46a8899c 100644
--- a/libs/auditok/util.py
+++ b/libs/auditok/util.py
@@ -1,624 +1,448 @@
"""
-.. autosummary::
- :toctree: generated/
-
- AudioEnergyValidator
- AudioReader
- Recorder
- make_duration_formatter
- make_channel_selector
-"""
-from abc import ABC, abstractmethod
-import warnings
-from functools import partial
-from .io import (
- AudioIOError,
- AudioSource,
- from_file,
- BufferAudioSource,
- PyAudioSource,
- get_audio_source,
-)
-from .exceptions import (
- DuplicateArgument,
- TooSamllBlockDuration,
- TimeFormatError,
-)
-
-try:
- from . import signal_numpy as signal
-except ImportError:
- from . import signal
-
-
-__all__ = [
- "make_duration_formatter",
- "make_channel_selector",
- "DataSource",
- "DataValidator",
- "StringDataSource",
- "ADSFactory",
- "AudioDataSource",
- "AudioReader",
- "Recorder",
- "AudioEnergyValidator",
-]
-
-
-def make_duration_formatter(fmt):
- """
- Make and return a function used to format durations in seconds. Accepted
- format directives are:
-
-    - ``%S`` : absolute number of seconds with 3 decimals. This directive should
-      be used alone.
-    - ``%I`` : absolute number of milliseconds. This directive should also be
-      used alone.
-    - ``%i`` : milliseconds
- - ``%s`` : seconds
- - ``%m`` : minutes
- - ``%h`` : hours
-
- These last 4 directives should all be specified. They can be placed anywhere
- in the input string.
-
- Parameters
- ----------
- fmt : str
- duration format.
-
- Returns
- -------
- formatter : callable
- a function that takes a duration in seconds (float) and returns a string
- that corresponds to that duration.
-
- Raises
- ------
- TimeFormatError
- if the format contains an unknown directive.
-
- Examples
- --------
-
- Using ``%S``:
-
- .. code:: python
-
- formatter = make_duration_formatter("%S")
- formatter(123.589)
- '123.589'
- formatter(123)
- '123.000'
-
- Using the other directives:
-
- .. code:: python
-
- formatter = make_duration_formatter("%h:%m:%s.%i")
- formatter(3600+120+3.25)
- '01:02:03.250'
-
- formatter = make_duration_formatter("%h hrs, %m min, %s sec and %i ms")
- formatter(3600+120+3.25)
- '01 hrs, 02 min, 03 sec and 250 ms'
-
- # omitting one of the 4 directives might result in a wrong duration
- formatter = make_duration_formatter("%m min, %s sec and %i ms")
- formatter(3600+120+3.25)
- '02 min, 03 sec and 250 ms'
- """
- if fmt == "%S":
-
-        def formatter(seconds):
- return "{:.3f}".format(seconds)
-
- elif fmt == "%I":
-
-        def formatter(seconds):
- return "{0}".format(int(seconds * 1000))
-
- else:
- fmt = fmt.replace("%h", "{hrs:02d}")
- fmt = fmt.replace("%m", "{mins:02d}")
- fmt = fmt.replace("%s", "{secs:02d}")
- fmt = fmt.replace("%i", "{millis:03d}")
- try:
- i = fmt.index("%")
- raise TimeFormatError(
- "Unknown time format directive '{0}'".format(fmt[i : i + 2])
- )
- except ValueError:
- pass
-
-        def formatter(seconds):
- millis = int(seconds * 1000)
- hrs, millis = divmod(millis, 3600000)
- mins, millis = divmod(millis, 60000)
- secs, millis = divmod(millis, 1000)
- return fmt.format(hrs=hrs, mins=mins, secs=secs, millis=millis)
-
-    return formatter
-
-
-def make_channel_selector(sample_width, channels, selected=None):
- """Create and return a callable used for audio channel selection. The
- returned selector can be used as `selector(audio_data)` and returns data
- that contains selected channel only.
-
- Importantly, if `selected` is None or equals "any", `selector(audio_data)`
- will separate and return a list of available channels:
-    `[data_channel_1, data_channel_2, ...].`
-
-    Note also that the returned `selector` expects `bytes` format for input
-    data but does not necessarily return a `bytes` object. In fact, in order to
-    extract the desired channel (or compute the average channel if `selected` =
-    "avg"), it first converts input data into an `array.array` (or
-    `numpy.ndarray`) object. After the channel of interest is selected/computed,
-    it is returned as
- such, without any reconversion to `bytes`. This behavior is wanted for
- efficiency purposes because returned objects can be directly used as buffers
- of bytes. In any case, returned objects can be converted back to `bytes`
- using `bytes(obj)`.
+Class summary
+=============
-    The exception to this is the special case where `channels` = 1, in which
-    case input data is returned without any processing.
-
-
- Parameters
- ----------
- sample_width : int
- number of bytes used to encode one audio sample, should be 1, 2 or 4.
- channels : int
- number of channels of raw audio data that the returned selector should
- expect.
- selected : int or str, default: None
- audio channel to select and return when calling `selector(raw_data)`. It
- should be an int >= `-channels` and < `channels`. If one of "mix",
- "avg" or "average" is passed then `selector` will return the average
- channel of audio data. If None or "any", return a list of all available
- channels at each call.
-
- Returns
- -------
- selector : callable
- a callable that can be used as `selector(audio_data)` and returns data
-        that contains the channel of interest.
-
- Raises
- ------
- ValueError
- if `sample_width` is not one of 1, 2 or 4, or if `selected` has an
- unexpected value.
- """
- fmt = signal.FORMAT.get(sample_width)
- if fmt is None:
- err_msg = "'sample_width' must be 1, 2 or 4, given: {}"
- raise ValueError(err_msg.format(sample_width))
- if channels == 1:
- return lambda x: x
-
- if isinstance(selected, int):
- if selected < 0:
- selected += channels
- if selected < 0 or selected >= channels:
- err_msg = "Selected channel must be >= -channels and < channels"
- err_msg += ", given: {}"
- raise ValueError(err_msg.format(selected))
- return partial(
- signal.extract_single_channel,
- fmt=fmt,
- channels=channels,
- selected=selected,
- )
+.. autosummary::
- if selected in ("mix", "avg", "average"):
- if channels == 2:
- # when data is stereo, using audioop when possible is much faster
- return partial(
- signal.compute_average_channel_stereo,
- sample_width=sample_width,
- )
+ DataSource
+ StringDataSource
+ ADSFactory
+ ADSFactory.AudioDataSource
+ ADSFactory.ADSDecorator
+ ADSFactory.OverlapADS
+ ADSFactory.LimiterADS
+ ADSFactory.RecorderADS
+ DataValidator
+ AudioEnergyValidator
- return partial(
- signal.compute_average_channel, fmt=fmt, channels=channels
- )
+"""
- if selected in (None, "any"):
- return partial(signal.separate_channels, fmt=fmt, channels=channels)
- raise ValueError(
- "Selected channel must be an integer, None (alias 'any') or 'average' "
- "(alias 'avg' or 'mix')"
- )
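
A usage sketch of the `make_channel_selector` helper removed above (0.2.0 API; the byte values are illustrative and assume a little-endian machine):

.. code:: python

    from auditok.util import make_channel_selector  # 0.2.0 API removed here

    stereo = b"\x01\x00\x02\x00\x03\x00\x04\x00"    # 16-bit samples: L R L R
    left = make_channel_selector(sample_width=2, channels=2, selected=0)
    print(bytes(left(stereo)))    # b'\x01\x00\x03\x00' -> samples 1 and 3
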
+from abc import ABCMeta, abstractmethod
+import math
+from array import array
+from .io import Rewindable, from_file, BufferAudioSource, PyAudioSource
+from .exceptions import DuplicateArgument
+import sys
-class DataSource(ABC):
+try:
+ import numpy
+ _WITH_NUMPY = True
+except ImportError as e:
+ _WITH_NUMPY = False
+
+try:
+ from builtins import str
+ basestring = str
+except ImportError as e:
+ if sys.version_info >= (3, 0):
+ basestring = str
+
+
+
+__all__ = ["DataSource", "DataValidator", "StringDataSource", "ADSFactory", "AudioEnergyValidator"]
+
+
+class DataSource():
"""
- Base class for objects passed to :func:`StreamTokenizer.tokenize`.
+ Base class for objects passed to :func:`auditok.core.StreamTokenizer.tokenize`.
Subclasses should implement a :func:`DataSource.read` method.
"""
-
+ __metaclass__ = ABCMeta
+
@abstractmethod
def read(self):
"""
-        Read a block (i.e., window) of data from this source.
+        Read a piece of data from this source.
If no more data is available, return None.
"""
-
-
-class DataValidator(ABC):
+
+
+class DataValidator():
"""
- Base class for a validator object used by :class:`.core.StreamTokenizer`
- to check if read data is valid.
+ Base class for a validator object used by :class:`.core.StreamTokenizer` to check
+ if read data is valid.
Subclasses should implement :func:`is_valid` method.
"""
-
+ __metaclass__ = ABCMeta
+
@abstractmethod
def is_valid(self, data):
"""
Check whether `data` is valid
"""
-
-class AudioEnergyValidator(DataValidator):
- """
- A validator based on audio signal energy. For an input window of `N` audio
- samples (see :func:`AudioEnergyValidator.is_valid`), the energy is computed
- as:
-
-    .. math:: energy = 20 \log_{10}\left(\sqrt{\frac{1}{N}\sum_{i=1}^{N}{a_i}^2}\right) % # noqa: W605
-
- where `a_i` is the i-th audio sample.
-
- Parameters
- ----------
- energy_threshold : float
- minimum energy that audio window should have to be valid.
- sample_width : int
- size in bytes of one audio sample.
- channels : int
- number of channels of audio data.
- use_channel : {None, "any", "mix", "avg", "average"} or int
- channel to use for energy computation. The following values are
- accepted:
-
- - None (alias "any") : compute energy for each of the channels and return
- the maximum value.
- - "mix" (alias "avg" or "average") : compute the average channel then
- compute its energy.
- - int (>= 0 , < `channels`) : compute the energy of the specified channel
- and ignore the other ones.
-
- Returns
- -------
- energy : float
- energy of the audio window.
- """
-
- def __init__(
- self, energy_threshold, sample_width, channels, use_channel=None
- ):
- self._sample_width = sample_width
- self._selector = make_channel_selector(
- sample_width, channels, use_channel
- )
- if channels == 1 or use_channel not in (None, "any"):
- self._energy_fn = signal.calculate_energy_single_channel
- else:
- self._energy_fn = signal.calculate_energy_multichannel
- self._energy_threshold = energy_threshold
-
- def is_valid(self, data):
- """
-
- Parameters
- ----------
- data : bytes-like
- array of raw audio data
-
- Returns
- -------
- bool
- True if the energy of audio data is >= threshold, False otherwise.
- """
- log_energy = self._energy_fn(self._selector(data), self._sample_width)
- return log_energy >= self._energy_threshold
-
-
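
A usage sketch of the 0.2.0 `AudioEnergyValidator` removed above (the threshold value is illustrative):

.. code:: python

    from auditok.util import AudioEnergyValidator  # 0.2.0 API removed here

    validator = AudioEnergyValidator(energy_threshold=50, sample_width=2,
                                     channels=1)
    silence = b"\x00\x00" * 160            # 10 ms of 16-bit silence at 16 kHz
    print(validator.is_valid(silence))     # False: energy is far below 50 dB
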
class StringDataSource(DataSource):
"""
- Class that represent a :class:`DataSource` as a string buffer.
-    Each call to :func:`DataSource.read` returns one character and moves one
- step forward. If the end of the buffer is reached, :func:`read` returns
- None.
-
- Parameters
- ----------
- data : str
- a string object used as data.
-
+ A class that represent a :class:`DataSource` as a string buffer.
+    Each call to :func:`DataSource.read` returns one character and moves one step forward.
+ If the end of the buffer is reached, :func:`read` returns None.
+
+ :Parameters:
+
+ `data` :
+ a basestring object.
+
"""
-
+
def __init__(self, data):
self._data = None
self._current = 0
self.set_data(data)
-
+
+
def read(self):
"""
Read one character from buffer.
-
- Returns
- -------
- char : str
- current character or None if end of buffer is reached.
+
+ :Returns:
+
+ Current character or None if end of buffer is reached
"""
-
+
if self._current >= len(self._data):
return None
self._current += 1
return self._data[self._current - 1]
-
+
def set_data(self, data):
"""
Set a new data buffer.
-
- Parameters
- ----------
- data : str
- new data buffer.
+
+ :Parameters:
+
+ `data` : a basestring object
+ New data buffer.
"""
-
-        if not isinstance(data, str):
-            raise ValueError("data must be an instance of str")
+
+        if not isinstance(data, basestring):
+            raise ValueError("data must be an instance of basestring")
self._data = data
self._current = 0
+
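
A usage sketch of `StringDataSource` as defined above:

.. code:: python

    from auditok.util import StringDataSource

    src = StringDataSource("abc")
    chars = []
    while True:
        c = src.read()    # one character per call, None at end of buffer
        if c is None:
            break
        chars.append(c)
    assert "".join(chars) == "abc"
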
class ADSFactory:
"""
- .. deprecated:: 2.0.0
- `ADSFactory` will be removed in auditok 2.0.1, use instances of
- :class:`AudioReader` instead.
-
- Factory class that makes it easy to create an
- :class:`AudioDataSource` object that implements
- :class:`DataSource` and can therefore be passed to
- :func:`auditok.core.StreamTokenizer.tokenize`.
-
- Whether you read audio data from a file, the microphone or a memory buffer,
- this factory instantiates and returns the right
- :class:`AudioDataSource` object.
-
- There are many other features you want a :class:`AudioDataSource` object to
- have, such as: memorize all read audio data so that you can rewind and reuse
- it (especially useful when reading data from the microphone), read a fixed
- amount of data (also useful when reading from the microphone), read
-    overlapping audio frames (often needed when doing a spectral analysis of
- data).
-
-    :func:`ADSFactory.ads` automatically creates and returns an object with the
- desired behavior according to the supplied keyword arguments.
+ Factory class that makes it easy to create an :class:`ADSFactory.AudioDataSource` object that implements
+ :class:`DataSource` and can therefore be passed to :func:`auditok.core.StreamTokenizer.tokenize`.
+
+ Whether you read audio data from a file, the microphone or a memory buffer, this factory
+ instantiates and returns the right :class:`ADSFactory.AudioDataSource` object.
+
+ There are many other features you want your :class:`ADSFactory.AudioDataSource` object to have, such as:
+ memorize all read audio data so that you can rewind and reuse it (especially useful when
+ reading data from the microphone), read a fixed amount of data (also useful when reading
+    from the microphone), read overlapping audio frames (often needed when doing a spectral
+ analysis of data).
+
+    :func:`ADSFactory.ads` automatically creates and returns an object with the desired behavior according
+ to the supplied keyword arguments.
+
"""
-
- @staticmethod # noqa: C901
+
+ @staticmethod
def _check_normalize_args(kwargs):
-
+
for k in kwargs:
- if k not in [
- "block_dur",
- "hop_dur",
- "block_size",
- "hop_size",
- "max_time",
- "record",
- "audio_source",
- "filename",
- "data_buffer",
- "frames_per_buffer",
- "sampling_rate",
- "sample_width",
- "channels",
- "sr",
- "sw",
- "ch",
- "asrc",
- "fn",
- "fpb",
- "db",
- "mt",
- "rec",
- "bd",
- "hd",
- "bs",
- "hs",
- ]:
+ if not k in ["block_dur", "hop_dur", "block_size", "hop_size", "max_time", "record",
+ "audio_source", "filename", "data_buffer", "frames_per_buffer", "sampling_rate",
+ "sample_width", "channels", "sr", "sw", "ch", "asrc", "fn", "fpb", "db", "mt",
+ "rec", "bd", "hd", "bs", "hs"]:
raise ValueError("Invalid argument: {0}".format(k))
-
+
if "block_dur" in kwargs and "bd" in kwargs:
- raise DuplicateArgument(
- "Either 'block_dur' or 'bd' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'block_dur' or 'bd' must be specified, not both")
+
if "hop_dur" in kwargs and "hd" in kwargs:
- raise DuplicateArgument(
- "Either 'hop_dur' or 'hd' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'hop_dur' or 'hd' must be specified, not both")
+
if "block_size" in kwargs and "bs" in kwargs:
- raise DuplicateArgument(
- "Either 'block_size' or 'bs' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'block_size' or 'bs' must be specified, not both")
+
if "hop_size" in kwargs and "hs" in kwargs:
- raise DuplicateArgument(
- "Either 'hop_size' or 'hs' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'hop_size' or 'hs' must be specified, not both")
+
if "max_time" in kwargs and "mt" in kwargs:
- raise DuplicateArgument(
- "Either 'max_time' or 'mt' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'max_time' or 'mt' must be specified, not both")
+
if "audio_source" in kwargs and "asrc" in kwargs:
- raise DuplicateArgument(
- "Either 'audio_source' or 'asrc' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'audio_source' or 'asrc' must be specified, not both")
+
if "filename" in kwargs and "fn" in kwargs:
- raise DuplicateArgument(
- "Either 'filename' or 'fn' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'filename' or 'fn' must be specified, not both")
+
if "data_buffer" in kwargs and "db" in kwargs:
- raise DuplicateArgument(
- "Either 'filename' or 'db' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'filename' or 'db' must be specified, not both")
+
if "frames_per_buffer" in kwargs and "fbb" in kwargs:
- raise DuplicateArgument(
- "Either 'frames_per_buffer' or 'fpb' must be specified, not "
- "both"
- )
-
+ raise DuplicateArgument("Either 'frames_per_buffer' or 'fpb' must be specified, not both")
+
if "sampling_rate" in kwargs and "sr" in kwargs:
- raise DuplicateArgument(
- "Either 'sampling_rate' or 'sr' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'sampling_rate' or 'sr' must be specified, not both")
+
if "sample_width" in kwargs and "sw" in kwargs:
- raise DuplicateArgument(
- "Either 'sample_width' or 'sw' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'sample_width' or 'sw' must be specified, not both")
+
if "channels" in kwargs and "ch" in kwargs:
- raise DuplicateArgument(
- "Either 'channels' or 'ch' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'channels' or 'ch' must be specified, not both")
+
if "record" in kwargs and "rec" in kwargs:
- raise DuplicateArgument(
- "Either 'record' or 'rec' must be specified, not both"
- )
-
+ raise DuplicateArgument("Either 'record' or 'rec' must be specified, not both")
+
+
kwargs["bd"] = kwargs.pop("block_dur", None) or kwargs.pop("bd", None)
kwargs["hd"] = kwargs.pop("hop_dur", None) or kwargs.pop("hd", None)
kwargs["bs"] = kwargs.pop("block_size", None) or kwargs.pop("bs", None)
kwargs["hs"] = kwargs.pop("hop_size", None) or kwargs.pop("hs", None)
kwargs["mt"] = kwargs.pop("max_time", None) or kwargs.pop("mt", None)
- kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop(
- "asrc", None
- )
+ kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop("asrc", None)
kwargs["fn"] = kwargs.pop("filename", None) or kwargs.pop("fn", None)
kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop("db", None)
-
+
record = kwargs.pop("record", False)
if not record:
record = kwargs.pop("rec", False)
if not isinstance(record, bool):
raise TypeError("'record' must be a boolean")
-
+
kwargs["rec"] = record
-
- # keep long names for arguments meant for BufferAudioSource
- # and PyAudioSource
+
+ # keep long names for arguments meant for BufferAudioSource and PyAudioSource
if "frames_per_buffer" in kwargs or "fpb" in kwargs:
- kwargs["frames_per_buffer"] = kwargs.pop(
- "frames_per_buffer", None
- ) or kwargs.pop("fpb", None)
-
+ kwargs["frames_per_buffer"] = kwargs.pop("frames_per_buffer", None) or kwargs.pop("fpb", None)
+
if "sampling_rate" in kwargs or "sr" in kwargs:
- kwargs["sampling_rate"] = kwargs.pop(
- "sampling_rate", None
- ) or kwargs.pop("sr", None)
-
- if "sample_width" in kwargs or "sw" in kwargs:
- kwargs["sample_width"] = kwargs.pop(
- "sample_width", None
- ) or kwargs.pop("sw", None)
-
+ kwargs["sampling_rate"] = kwargs.pop("sampling_rate", None) or kwargs.pop("sr", None)
+
+ if "sample_width" in kwargs or "sw" in kwargs:
+ kwargs["sample_width"] = kwargs.pop("sample_width", None) or kwargs.pop("sw", None)
+
if "channels" in kwargs or "ch" in kwargs:
- kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop(
- "ch", None
- )
-
+ kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop("ch", None)
+
+
+
+
+
+
+
@staticmethod
def ads(**kwargs):
+
"""
-        Create and return an :class:`AudioDataSource`. The type and
-        behavior of the object are determined
-        by the supplied parameters. Called without any parameters, the class
- will read audio data from the available built-in microphone with the
- default parameters.
-
- Parameters
- ----------
- sampling_rate, sr : int, default: 16000
- number of audio samples per second of input audio stream.
- sample_width, sw : int, default: 2
- number of bytes per sample, must be one of 1, 2 or 4
- channels, ch : int, default: 1
- number of audio channels, only a value of 1 is currently accepted.
- frames_per_buffer, fpb : int, default: 1024
- number of samples of PyAudio buffer.
- audio_source, asrc : `AudioSource`
- `AudioSource` to read data from
- filename, fn : str
- create an `AudioSource` object using this file
- data_buffer, db : str
- build an `io.BufferAudioSource` using data in `data_buffer`.
- If this keyword is used,
- `sampling_rate`, `sample_width` and `channels` are passed to
- `io.BufferAudioSource` constructor and used instead of default
- values.
- max_time, mt : float
- maximum time (in seconds) to read. Default behavior: read until
- there is no more data
- available.
- record, rec : bool, default = False
- save all read data in cache. Provide a navigable object which has a
- `rewind` method.
- block_dur, bd : float
- processing block duration in seconds. This represents the quantity
- of audio data to return each time the :func:`read` method is
- invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling rate
- is 8000 and the sample width is 2 bytes, :func:`read` returns a
- buffer of 0.025 * 8000 * 2 = 400 bytes at most. This parameter will
- be looked for (and used if available) before `block_size`. If
- neither parameter is given, `block_dur` will be set to 0.01 second
- (i.e. 10 ms)
- hop_dur, hd : float
- quantity of data to skip from current processing window. if
- `hop_dur` is supplied then there will be an overlap of `block_dur`
- - `hop_dur` between two adjacent blocks. This parameter will be
- looked for (and used if available) before `hop_size`.
- If neither parameter is given, `hop_dur` will be set to `block_dur`
- which means that there will be no overlap between two consecutively
- read blocks.
- block_size, bs : int
- number of samples to read each time the `read` method is called.
- Default: a block size that represents a window of 10ms, so for a
- sampling rate of 16000, the default `block_size` is 160 samples,
- for a rate of 44100, `block_size` = 441 samples, etc.
- hop_size, hs : int
- determines the number of overlapping samples between two adjacent
- read windows. For a `hop_size` of value *N*, the overlap is
- `block_size` - *N*. Default : `hop_size` = `block_size`, means that
- there is no overlap.
-
- Returns
- -------
- audio_data_source : AudioDataSource
- an `AudioDataSource` object build with input parameters.
+        Create and return an :class:`ADSFactory.AudioDataSource`. The type and behavior of the object are
+        determined by the supplied parameters.
+
+ :Parameters:
+
+ *No parameters* :
+ read audio data from the available built-in microphone with the default parameters.
+            The returned :class:`ADSFactory.AudioDataSource` encapsulates an :class:`io.PyAudioSource` object,
+            hence the next four parameters can be passed to override their default values.
+
+ `sampling_rate`, `sr` : *(int)*
+ number of samples per second. Default = 16000.
+
+ `sample_width`, `sw` : *(int)*
+ number of bytes per sample (must be in (1, 2, 4)). Default = 2
+
+ `channels`, `ch` : *(int)*
+ number of audio channels. Default = 1 (only this value is currently accepted)
+
+ `frames_per_buffer`, `fpb` : *(int)*
+ number of samples of PyAudio buffer. Default = 1024.
+
+ `audio_source`, `asrc` : an `AudioSource` object
+ read data from this audio source
+
+ `filename`, `fn` : *(string)*
+ build an `io.AudioSource` object using this file (currently only wave format is supported)
+
+ `data_buffer`, `db` : *(string)*
+ build an `io.BufferAudioSource` using data in `data_buffer`. If this keyword is used,
+ `sampling_rate`, `sample_width` and `channels` are passed to `io.BufferAudioSource`
+ constructor and used instead of default values.
+
+ `max_time`, `mt` : *(float)*
+ maximum time (in seconds) to read. Default behavior: read until there is no more data
+ available.
+
+ `record`, `rec` : *(bool)*
+            save all read data in cache. Provide a navigable object which has a `rewind` method.
+ Default = False.
+
+ `block_dur`, `bd` : *(float)*
+ processing block duration in seconds. This represents the quantity of audio data to return
+ each time the :func:`read` method is invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling
+ rate is 8000 and the sample width is 2 bytes, :func:`read` returns a buffer of 0.025 * 8000 * 2 = 400
+ bytes at most. This parameter will be looked for (and used if available) before `block_size`.
+ If neither parameter is given, `block_dur` will be set to 0.01 second (i.e. 10 ms)
+
+
+ `hop_dur`, `hd` : *(float)*
+           quantity of data to skip from the current processing window. If `hop_dur` is supplied, then there
+ will be an overlap of `block_dur` - `hop_dur` between two adjacent blocks. This
+ parameter will be looked for (and used if available) before `hop_size`. If neither parameter
+ is given, `hop_dur` will be set to `block_dur` which means that there will be no overlap
+ between two consecutively read blocks.
+
+ `block_size`, `bs` : *(int)*
+ number of samples to read each time the `read` method is called. Default: a block size
+ that represents a window of 10ms, so for a sampling rate of 16000, the default `block_size`
+ is 160 samples, for a rate of 44100, `block_size` = 441 samples, etc.
+
+ `hop_size`, `hs` : *(int)*
+ determines the number of overlapping samples between two adjacent read windows. For a
+ `hop_size` of value *N*, the overlap is `block_size` - *N*. Default : `hop_size` = `block_size`,
+ means that there is no overlap.
+
+ :Returns:
+
+ An AudioDataSource object that has the desired features.
+
+        :Examples:
+
+        1. **Create an AudioDataSource that reads data from the microphone (requires PyAudio) with default audio parameters:**
+
+ .. code:: python
+
+ from auditok import ADSFactory
+ ads = ADSFactory.ads()
+ ads.get_sampling_rate()
+ 16000
+ ads.get_sample_width()
+ 2
+ ads.get_channels()
+ 1
+
+
+ 2. **Create an AudioDataSource that reads data from the microphone with a sampling rate of 48KHz:**
+
+ .. code:: python
+
+ from auditok import ADSFactory
+ ads = ADSFactory.ads(sr=48000)
+ ads.get_sampling_rate()
+ 48000
+
+ 3. **Create an AudioDataSource that reads data from a wave file:**
+
+ .. code:: python
+
+ import auditok
+ from auditok import ADSFactory
+ ads = ADSFactory.ads(fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+ ads.get_sampling_rate()
+ 44100
+ ads.get_sample_width()
+ 2
+ ads.get_channels()
+ 1
+
+ 4. **Define size of read blocks as 20 ms**
+
+ .. code:: python
+
+ import auditok
+ from auditok import ADSFactory
+ '''
+            we know the sampling rate of the previous file is 44100 samples/second
+            so 10 ms is equivalent to 441 samples and 20 ms to 882
+ '''
+ block_size = 882
+            ads = ADSFactory.ads(bs = block_size, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+ ads.open()
+ # read one block
+ data = ads.read()
+ ads.close()
+ len(data)
+ 1764
+ assert len(data) == ads.get_sample_width() * block_size
+
+ 5. **Define block size as a duration (use block_dur or bd):**
+
+ .. code:: python
+
+ import auditok
+ from auditok import ADSFactory
+ dur = 0.25 # second
+ ads = ADSFactory.ads(bd = dur, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+ '''
+            we know the sampling rate of the previous file is 44100 samples/second
+ for a block duration of 250 ms, block size should be 0.25 * 44100 = 11025
+ '''
+ ads.get_block_size()
+ 11025
+ assert ads.get_block_size() == int(0.25 * 44100)
+ ads.open()
+ # read one block
+ data = ads.read()
+ ads.close()
+ len(data)
+ 22050
+ assert len(data) == ads.get_sample_width() * ads.get_block_size()
+
+        6. **Read overlapping blocks (one of hop_size, hs, hop_dur or hd > 0):**
+
+        For better readability, we use a :class:`auditok.io.BufferAudioSource` with a string buffer:
+
+ .. code:: python
+
+ import auditok
+ from auditok import ADSFactory
+ '''
+            we supply a data buffer instead of a file (keyword 'data_buffer' or 'db')
+ sr : sampling rate = 16 samples/sec
+ sw : sample width = 1 byte
+ ch : channels = 1
+ '''
+ buffer = "abcdefghijklmnop" # 16 bytes = 1 second of data
+ bd = 0.250 # block duration = 250 ms = 4 bytes
+ hd = 0.125 # hop duration = 125 ms = 2 bytes
+            ads = ADSFactory.ads(db = buffer, bd = bd, hd = hd, sr = 16, sw = 1, ch = 1)
+ ads.open()
+ ads.read()
+ 'abcd'
+ ads.read()
+ 'cdef'
+ ads.read()
+ 'efgh'
+ ads.read()
+ 'ghij'
+ data = ads.read()
+ assert data == 'ijkl'
+
+ 7. **Limit amount of read data (use max_time or mt):**
+
+ .. code:: python
+
+            import auditok
+            from auditok import ADSFactory
+            '''
+            We know the audio file is longer than 2.25 seconds.
+            We want to read up to 2.25 seconds of audio data.
+            '''
+ ads = ADSFactory.ads(mt = 2.25, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+ ads.open()
+ data = []
+ while True:
+ d = ads.read()
+ if d is None:
+ break
+ data.append(d)
+
+ ads.close()
+ data = b''.join(data)
+ assert len(data) == int(ads.get_sampling_rate() * 2.25 * ads.get_sample_width() * ads.get_channels())
"""
- warnings.warn(
- "'ADSFactory' is deprecated and will be removed in a future "
- "release. Please use AudioReader class instead.",
- DeprecationWarning,
- )
-
+
+        # copy user's dictionary (shallow copy)
+ kwargs = kwargs.copy()
+
# check and normalize keyword arguments
ADSFactory._check_normalize_args(kwargs)
-
+
block_dur = kwargs.pop("bd")
hop_dur = kwargs.pop("hd")
block_size = kwargs.pop("bs")
@@ -628,483 +452,431 @@ class ADSFactory:
filename = kwargs.pop("fn")
data_buffer = kwargs.pop("db")
record = kwargs.pop("rec")
-
+
# Case 1: an audio source is supplied
if audio_source is not None:
if (filename, data_buffer) != (None, None):
- raise Warning(
- "You should provide one of 'audio_source', 'filename' or \
- 'data_buffer' keyword parameters. 'audio_source' will be \
- used"
- )
-
+ raise Warning("You should provide one of 'audio_source', 'filename' or 'data_buffer'\
+ keyword parameters. 'audio_source' will be used")
+
# Case 2: a file name is supplied
elif filename is not None:
if data_buffer is not None:
- raise Warning(
- "You should provide one of 'filename' or 'data_buffer'\
- keyword parameters. 'filename' will be used"
- )
+ raise Warning("You should provide one of 'filename' or 'data_buffer'\
+ keyword parameters. 'filename' will be used")
audio_source = from_file(filename)
-
- # Case 3: a data_buffer is supplied
+
+ # Case 3: a data_buffer is supplied
elif data_buffer is not None:
- audio_source = BufferAudioSource(data=data_buffer, **kwargs)
-
+ audio_source = BufferAudioSource(data_buffer = data_buffer, **kwargs)
+
# Case 4: try to access native audio input
else:
audio_source = PyAudioSource(**kwargs)
-
+
+
if block_dur is not None:
if block_size is not None:
- raise DuplicateArgument(
- "Either 'block_dur' or 'block_size' can be specified, not \
- both"
- )
- elif block_size is not None:
- block_dur = block_size / audio_source.sr
- else:
- block_dur = 0.01 # 10 ms
-
+ raise DuplicateArgument("Either 'block_dur' or 'block_size' can be specified, not both")
+ else:
+ block_size = int(audio_source.get_sampling_rate() * block_dur)
+ elif block_size is None:
+ # Set default block_size to 10 ms
+ block_size = int(audio_source.get_sampling_rate() / 100)
+
+ # Instantiate base AudioDataSource
+ ads = ADSFactory.AudioDataSource(audio_source=audio_source, block_size=block_size)
+
+ # Limit data to be read
+ if max_time is not None:
+ ads = ADSFactory.LimiterADS(ads=ads, max_time=max_time)
+
+ # Record, rewind and reuse data
+ if record:
+ ads = ADSFactory.RecorderADS(ads=ads)
+
# Read overlapping blocks of data
if hop_dur is not None:
if hop_size is not None:
- raise DuplicateArgument(
- "Either 'hop_dur' or 'hop_size' can be specified, not both"
- )
- elif hop_size is not None:
- hop_dur = hop_size / audio_source.sr
-
- ads = AudioDataSource(
- audio_source,
- block_dur=block_dur,
- hop_dur=hop_dur,
- record=record,
- max_read=max_time,
- )
+ raise DuplicateArgument("Either 'hop_dur' or 'hop_size' can be specified, not both")
+ else:
+ hop_size = int(audio_source.get_sampling_rate() * hop_dur)
+
+ if hop_size is not None:
+ if hop_size <= 0 or hop_size > block_size:
+ raise ValueError("hop_size must be > 0 and <= block_size")
+ if hop_size < block_size:
+ ads = ADSFactory.OverlapADS(ads=ads, hop_size=hop_size)
+
return ads
+
+
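To make the factory's decorator stacking concrete, here is a minimal editorial sketch (not part of the library; the buffer and parameter values are invented) of a call that engages all three wrapper classes defined below:

.. code:: python

    from auditok import ADSFactory
    # 16 bytes at sr=16, sw=1, ch=1 amount to one second of audio
    ads = ADSFactory.ads(db="abcdefghijklmnop", sr=16, sw=1, ch=1,
                         mt=0.5, rec=True, bd=0.25, hd=0.125)
    # wrapping order: AudioDataSource -> LimiterADS -> RecorderADS -> OverlapADS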
+ class AudioDataSource(DataSource):
+ """
+ Base class for AudioDataSource objects.
+ It inherits from DataSource and encapsulates an AudioSource object.
+ """
+
+ def __init__(self, audio_source, block_size):
+
+ self.audio_source = audio_source
+ self.block_size = block_size
+
+ def get_block_size(self):
+ return self.block_size
+
+ def set_block_size(self, size):
+ self.block_size = size
+
+ def get_audio_source(self):
+ return self.audio_source
+
+ def set_audio_source(self, audio_source):
+ self.audio_source = audio_source
+
+ def open(self):
+ self.audio_source.open()
+
+ def close(self):
+ self.audio_source.close()
+
+ def is_open(self):
+ return self.audio_source.is_open()
+
+ def get_sampling_rate(self):
+ return self.audio_source.get_sampling_rate()
+
+ def get_sample_width(self):
+ return self.audio_source.get_sample_width()
+
+ def get_channels(self):
+ return self.audio_source.get_channels()
+
+
+ def rewind(self):
+ if isinstance(self.audio_source, Rewindable):
+ self.audio_source.rewind()
+ else:
+ raise Exception("Audio source is not rewindable")
+
+
+
+ def is_rewindable(self):
+ return isinstance(self.audio_source, Rewindable)
+
+
+ def read(self):
+ return self.audio_source.read(self.block_size)
+
+
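As a minimal editorial sketch of using this base class directly around a `BufferAudioSource` (the values are invented; arguments are passed positionally, as elsewhere in this module):

.. code:: python

    from auditok import ADSFactory
    from auditok.io import BufferAudioSource
    # buffer, sampling rate, sample width, channels
    source = BufferAudioSource("abcdefgh", 8, 1, 1)
    ads = ADSFactory.AudioDataSource(audio_source=source, block_size=4)
    ads.open()
    data = ads.read()   # 'abcd'
    ads.close()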
+ class ADSDecorator(AudioDataSource):
+ """
+ Base decorator class for AudioDataSource objects.
+ """
+ __metaclass__ = ABCMeta
+
+ def __init__(self, ads):
+ self.ads = ads
+
+ self.get_block_size = self.ads.get_block_size
+ self.set_block_size = self.ads.set_block_size
+ self.get_audio_source = self.ads.get_audio_source
+ self.open = self.ads.open
+ self.close = self.ads.close
+ self.is_open = self.ads.is_open
+ self.get_sampling_rate = self.ads.get_sampling_rate
+ self.get_sample_width = self.ads.get_sample_width
+ self.get_channels = self.ads.get_channels
+
+ def is_rewindable(self):
+            return self.ads.is_rewindable()
+
+ def rewind(self):
+ self.ads.rewind()
+ self._reinit()
+
+ def set_audio_source(self, audio_source):
+ self.ads.set_audio_source(audio_source)
+ self._reinit()
+
+ def open(self):
+ if not self.ads.is_open():
+ self.ads.open()
+ self._reinit()
+
+ @abstractmethod
+ def _reinit(self):
+ pass
+
+
+ class OverlapADS(ADSDecorator):
+ """
+ A class for AudioDataSource objects that can read and return overlapping audio frames
+ """
+
+ def __init__(self, ads, hop_size):
+ ADSFactory.ADSDecorator.__init__(self, ads)
+
+ if hop_size <= 0 or hop_size > self.get_block_size():
+ raise ValueError("hop_size must be either 'None' or \
+ between 1 and block_size (both inclusive)")
+ self.hop_size = hop_size
+ self._actual_block_size = self.get_block_size()
+ self._reinit()
+
+
+        def _get_block_size(self):
+ return self._actual_block_size
+
+
+ def _read_first_block(self):
+ # For the first call, we need an entire block of size 'block_size'
+ block = self.ads.read()
+ if block is None:
+ return None
+
+ # Keep a slice of data in cache and append it in the next call
+ if len(block) > self._hop_size_bytes:
+ self._cache = block[self._hop_size_bytes:]
+
+            # From the next call on, we will use '_read_next_blocks'
+            # and only read 'hop_size' samples
+ self.ads.set_block_size(self.hop_size)
+ self.read = self._read_next_blocks
+
+ return block
+
+ def _read_next_blocks(self):
+ block = self.ads.read()
+ if block is None:
+ return None
+
+ # Append block to cache data to ensure overlap
+ block = self._cache + block
+            # Keep a slice of data in cache only if we have a full-length block;
+            # if we don't, that means this is the last block
+ if len(block) == self._block_size_bytes:
+ self._cache = block[self._hop_size_bytes:]
+ else:
+ self._cache = None
+
+ return block
-
-class _AudioReadingProxy:
- def __init__(self, audio_source):
-
- self._audio_source = audio_source
-
- def rewind(self):
- if self.rewindable:
- self._audio_source.rewind()
- else:
- raise AudioIOError("Audio stream is not rewindable")
-
- def rewindable(self):
- try:
- return self._audio_source.rewindable
- except AttributeError:
- return False
-
- def is_open(self):
- return self._audio_source.is_open()
-
- def open(self):
- self._audio_source.open()
-
- def close(self):
- self._audio_source.close()
-
- def read(self, size):
- return self._audio_source.read(size)
-
- @property
- def data(self):
- err_msg = "This AudioReader is not a recorder, no recorded data can "
- err_msg += "be retrieved"
- raise AttributeError(err_msg)
-
- def __getattr__(self, name):
- return getattr(self._audio_source, name)
-
-
-class _Recorder(_AudioReadingProxy):
- """
- Class for `AudioReader` objects that can record all data they read. Useful
- when reading data from microphone.
- """
-
- def __init__(self, audio_source):
- super(_Recorder, self).__init__(audio_source)
- self._cache = []
- self._read_block = self._read_and_cache
- self._read_from_cache = False
- self._data = None
-
- def read(self, size):
- return self._read_block(size)
-
- @property
- def data(self):
- if self._data is None:
- err_msg = "Unrewinded recorder. `rewind` should be called before "
- err_msg += "accessing recorded data"
- raise RuntimeError(err_msg)
- return self._data
-
- def rewindable(self):
- return True
-
- def rewind(self):
- if self._read_from_cache:
- self._audio_source.rewind()
- else:
- self._data = b"".join(self._cache)
+ def read(self):
+ pass
+
+ def _reinit(self):
self._cache = None
- self._audio_source = BufferAudioSource(
- self._data, self.sr, self.sw, self.ch
- )
- self._read_block = self._audio_source.read
- self.open()
- self._read_from_cache = True
-
- def _read_and_cache(self, size):
- # Read and save read data
- block = self._audio_source.read(size)
- if block is not None:
- self._cache.append(block)
- return block
-
-
-class _Limiter(_AudioReadingProxy):
- """
- Class for `AudioReader` objects that can read a fixed amount of data.
- This can be useful when reading data from the microphone or from large
- audio files.
- """
-
- def __init__(self, audio_source, max_read):
- super(_Limiter, self).__init__(audio_source)
- self._max_read = max_read
- self._max_samples = round(max_read * self.sr)
- self._bytes_per_sample = self.sw * self.ch
- self._read_samples = 0
-
- @property
- def data(self):
- data = self._audio_source.data
- max_read_bytes = self._max_samples * self._bytes_per_sample
- return data[:max_read_bytes]
-
- @property
- def max_read(self):
- return self._max_read
-
- def read(self, size):
- size = min(self._max_samples - self._read_samples, size)
- if size <= 0:
- return None
- block = self._audio_source.read(size)
- if block is None:
- return None
- self._read_samples += len(block) // self._bytes_per_sample
- return block
-
- def rewind(self):
- super(_Limiter, self).rewind()
- self._read_samples = 0
-
-
-class _FixedSizeAudioReader(_AudioReadingProxy):
- """
- Class to read fixed-size audio windows from source.
- """
-
- def __init__(self, audio_source, block_dur):
- super(_FixedSizeAudioReader, self).__init__(audio_source)
-
- if block_dur <= 0:
- raise ValueError(
- "block_dur must be > 0, given: {}".format(block_dur)
- )
-
- self._block_size = int(block_dur * self.sr)
- if self._block_size == 0:
- err_msg = "Too small block_dur ({0:f}) for sampling rate ({1}). "
- err_msg += "block_dur should cover at least one sample "
- err_msg += "(i.e. 1/{1})"
- raise TooSamllBlockDuration(
- err_msg.format(block_dur, self.sr), block_dur, self.sr
- )
-
- def read(self):
- return self._audio_source.read(self._block_size)
-
- @property
- def block_size(self):
- return self._block_size
-
- @property
- def block_dur(self):
- return self._block_size / self.sr
-
- def __getattr__(self, name):
- return getattr(self._audio_source, name)
-
-
-class _OverlapAudioReader(_FixedSizeAudioReader):
- """
- Class for `AudioReader` objects that can read and return overlapping audio
- windows.
- """
+ self.ads.set_block_size(self._actual_block_size)
+ self._hop_size_bytes = self.hop_size * \
+ self.get_sample_width() * \
+ self.get_channels()
+ self._block_size_bytes = self.get_block_size() * \
+ self.get_sample_width() * \
+ self.get_channels()
+ self.read = self._read_first_block
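A worked numeric sketch of the byte arithmetic in `_reinit` above (editorial; the rate and durations are hypothetical):

.. code:: python

    sr, sw, ch = 16000, 2, 1                 # sampling rate, sample width, channels
    block_size = int(sr * 0.025)             # 25 ms window -> 400 samples
    hop_size = int(sr * 0.010)               # 10 ms hop    -> 160 samples
    hop_size_bytes = hop_size * sw * ch      # 320 bytes read per subsequent call
    block_size_bytes = block_size * sw * ch  # 800 bytes returned per block
    overlap = block_size - hop_size          # 240 samples shared between blocks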
- def __init__(self, audio_source, block_dur, hop_dur):
- if hop_dur >= block_dur:
- raise ValueError('"hop_dur" should be < "block_dur"')
- super(_OverlapAudioReader, self).__init__(audio_source, block_dur)
-
- self._hop_size = int(hop_dur * self.sr)
- self._blocks = self._iter_blocks_with_overlap()
-
- def _iter_blocks_with_overlap(self):
- while not self.is_open():
- yield AudioIOError
- block = self._audio_source.read(self._block_size)
- if block is None:
- yield None
-
- _hop_size_bytes = (
- self._hop_size * self._audio_source.sw * self._audio_source.ch
- )
- cache = block[_hop_size_bytes:]
- yield block
-
- while True:
- block = self._audio_source.read(self._hop_size)
- if block:
- block = cache + block
- cache = block[_hop_size_bytes:]
- yield block
- continue
- yield None
-
- def read(self):
- try:
- block = next(self._blocks)
- if block == AudioIOError:
- raise AudioIOError("Audio Stream is not open.")
+ class LimiterADS(ADSDecorator):
+ """
+ A class for AudioDataSource objects that can read a fixed amount of data.
+ This can be useful when reading data from the microphone or from large audio files.
+ """
+
+ def __init__(self, ads, max_time):
+ ADSFactory.ADSDecorator.__init__(self, ads)
+
+ self.max_time = max_time
+ self._reinit()
+
+ def read(self):
+ if self._total_read_bytes >= self._max_read_bytes:
+ return None
+ block = self.ads.read()
+ if block is None:
+ return None
+ self._total_read_bytes += len(block)
+
+ if self._total_read_bytes >= self._max_read_bytes:
+ self.close()
+
return block
- except StopIteration:
- return None
-
- def rewind(self):
- super(_OverlapAudioReader, self).rewind()
- self._blocks = self._iter_blocks_with_overlap()
-
- @property
- def hop_size(self):
- return self._hop_size
-
- @property
- def hop_dur(self):
- return self._hop_size / self.sr
-
- def __getattr__(self, name):
- return getattr(self._audio_source, name)
-
+
+
+ def _reinit(self):
+ self._max_read_bytes = int(self.max_time * self.get_sampling_rate()) * \
+ self.get_sample_width() * \
+ self.get_channels()
+ self._total_read_bytes = 0
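For illustration, plugging the 2.25-second limit from example 7 above into this `_reinit` gives (editorial sketch, not library code):

.. code:: python

    max_time, sr, sw, ch = 2.25, 44100, 2, 1
    max_read_bytes = int(max_time * sr) * sw * ch
    # int(2.25 * 44100) * 2 * 1 = 99225 * 2 = 198450 bytes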
-class AudioReader(DataSource):
- """
- Class to read fixed-size chunks of audio data from a source. A source can
- be a file on disk, standard input (with `input` = "-") or microphone. This
- is normally used by tokenization algorithms that expect source objects with
-    a `read` function that returns a window of data of the same size at each
-    call, except when the remaining data does not make up a full window.
-
- Objects of this class can be set up to return audio windows with a given
- overlap and to record the whole stream for later access (useful when
- reading data from the microphone). They can also have
- a limit for the maximum amount of data to read.
-
- Parameters
- ----------
- input : str, bytes, AudioSource, AudioReader, AudioRegion or None
- input audio data. If the type of the passed argument is `str`, it should
-        be a path to an existing audio file. "-" is interpreted as standard input.
- If the type is `bytes`, input is considered as a buffer of raw audio
- data. If None, read audio from microphone. Every object that is not an
- :class:`AudioReader` will be transformed, when possible, into an
-        :class:`AudioSource` before processing. If it is a `str` that refers to
-        a raw audio file, `bytes` or None, audio parameters should be provided
-        using kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or
-        their aliases).
- block_dur: float, default: 0.01
- length in seconds of audio windows to return at each `read` call.
- hop_dur: float, default: None
-        amount of data, in seconds, to skip from the previous window. If
-        defined, it is used to compute the temporal overlap between the previous
-        and current window (namely `overlap = block_dur - hop_dur`). The default,
-        None, means that consecutive windows do not overlap.
- record: bool, default: False
- whether to record read audio data for later access. If True, audio data
- can be retrieved by first calling `rewind()`, then using the `data`
- property. Note that once `rewind()` is called, no new data will be read
-        from the source (subsequent `read()` calls read from the cache) and
-        there's no need to call `rewind()` again to access the `data` property.
- max_read: float, default: None
-        maximum amount of audio data to read in seconds. Default is None, meaning
-        that data will be read until the end of the stream is reached or, when
-        reading from the microphone, a Ctrl-C is sent.
-
-    When `input` is None, of type `bytes`, or a raw audio file, some of the
-    following kwargs are mandatory.
+
- Other Parameters
- ----------------
- audio_format, fmt : str
- type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
- used if `input` is a string path to an audio file. If not given, audio
- type will be guessed from file name extension or from file header.
- sampling_rate, sr : int
- sampling rate of audio data. Required if `input` is a raw audio file, is
- a bytes object or None (i.e., read from microphone).
- sample_width, sw : int
- number of bytes used to encode one audio sample, typically 1, 2 or 4.
- Required for raw data, see `sampling_rate`.
- channels, ch : int
- number of channels of audio data. Required for raw data, see
- `sampling_rate`.
- use_channel, uc : {None, "any", "mix", "avg", "average"} or int
- which channel to use for split if `input` has multiple audio channels.
- Regardless of which channel is used for splitting, returned audio events
- contain data from *all* the channels of `input`. The following values
- are accepted:
-
- - None (alias "any"): accept audio activity from any channel, even if
- other channels are silent. This is the default behavior.
-
- - "mix" (alias "avg" or "average"): mix down all channels (i.e., compute
- average channel) and split the resulting channel.
+ class RecorderADS(ADSDecorator):
+ """
+ A class for AudioDataSource objects that can record all audio data they read,
+ with a rewind facility.
+ """
+
+ def __init__(self, ads):
+ ADSFactory.ADSDecorator.__init__(self, ads)
+
+ self._reinit()
+
+ def read(self):
+ pass
+
+ def _read_and_rec(self):
+ # Read and save read data
+ block = self.ads.read()
+ if block is not None:
+ self._cache.append(block)
+
+ return block
+
+
+ def _read_simple(self):
+ # Read without recording
+ return self.ads.read()
+
+ def rewind(self):
+ if self._record:
+                # If it has been recording, create a new BufferAudioSource
+                # from the recorded data
+ dbuffer = self._concatenate(self._cache)
+ asource = BufferAudioSource(dbuffer, self.get_sampling_rate(),
+ self.get_sample_width(),
+ self.get_channels())
+
+
+ self.set_audio_source(asource)
+ self.open()
+ self._cache = []
+ self._record = False
+ self.read = self._read_simple
+
+ else:
+ self.ads.rewind()
+ if not self.is_open():
+ self.open()
+
+
+ def is_rewindable(self):
+ return True
+
+ def _reinit(self):
+ # when audio_source is replaced, start recording again
+ self._record = True
+ self._cache = []
+ self.read = self._read_and_rec
+
+ def _concatenate(self, data):
+ try:
+                # should always work for python 2;
+                # works for python 3 ONLY if data is a list (or an iterator)
+                # whose elements are all 'bytes' objects
+ return b''.join(data)
+ except TypeError:
+                # works for 'str' in python 2 and python 3
+ return ''.join(data)
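A minimal editorial sketch of the record-then-rewind cycle implemented above (the buffer and parameters are invented):

.. code:: python

    from auditok import ADSFactory
    ads = ADSFactory.ads(db="abcdefgh", bs=2, sr=8, sw=1, ch=1, rec=True)
    ads.open()
    while ads.read() is not None:   # first pass: read and cache everything
        pass
    ads.rewind()                    # cache becomes a BufferAudioSource
    ads.read()                      # 'ab' again, now served from the cache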
- - int (>= 0 , < `channels`): use one channel, specified by its integer
- id, for split.
- large_file : bool, default: False
-        If True, AND if `input` is a path to a *wav* or a *raw* audio file
- (and only these two formats) then audio data is lazily loaded to memory
-        (i.e., one analysis window at a time). Otherwise the whole file is loaded
- to memory before split. Set to True if the size of the file is larger
- than available memory.
+class AudioEnergyValidator(DataValidator):
"""
-
- def __init__(
- self,
- input,
- block_dur=0.01,
- hop_dur=None,
- record=False,
- max_read=None,
- **kwargs
- ):
- if not isinstance(input, AudioSource):
- input = get_audio_source(input, **kwargs)
- self._record = record
- if record:
- input = _Recorder(input)
- if max_read is not None:
- input = _Limiter(input, max_read)
- self._max_read = max_read
- if hop_dur is not None:
- input = _OverlapAudioReader(input, block_dur, hop_dur)
- else:
- input = _FixedSizeAudioReader(input, block_dur)
- self._audio_source = input
-
- def __repr__(self):
- block_dur, hop_dur, max_read = None, None, None
- if self.block_dur is not None:
- block_dur = "{:.3f}".format(self.block_dur)
- if self.hop_dur is not None:
- hop_dur = "{:.3f}".format(self.hop_dur)
- if self.max_read is not None:
- max_read = "{:.3f}".format(self.max_read)
- return (
- "{cls}(block_dur={block_dur}, "
- "hop_dur={hop_dur}, record={rewindable}, "
- "max_read={max_read})"
- ).format(
- cls=self.__class__.__name__,
- block_dur=block_dur,
- hop_dur=hop_dur,
- rewindable=self._record,
- max_read=max_read,
- )
-
- @property
- def rewindable(self):
- return self._record
-
- @property
- def block_dur(self):
- return self._audio_source.block_size / self._audio_source.sr
-
- @property
- def hop_dur(self):
- if hasattr(self._audio_source, "hop_dur"):
- return self._audio_source.hop_size / self._audio_source.sr
- return self.block_dur
-
- @property
- def hop_size(self):
- if hasattr(self._audio_source, "hop_size"):
- return self._audio_source.hop_size
- return self.block_size
-
- @property
- def max_read(self):
- try:
- return self._audio_source.max_read
- except AttributeError:
- return None
-
- def read(self):
- return self._audio_source.read()
-
- def __getattr__(self, name):
- if name in ("data", "rewind") and not self.rewindable:
- raise AttributeError(
- "'AudioReader' has no attribute '{}'".format(name)
- )
- try:
- return getattr(self._audio_source, name)
- except AttributeError:
- raise AttributeError(
- "'AudioReader' has no attribute '{}'".format(name)
- )
-
-
-# Keep AudioDataSource for compatibility
-# Remove in a future version when ADSFactory is removed
-AudioDataSource = AudioReader
-
-
-class Recorder(AudioReader):
-    """Class to read fixed-size chunks of audio data from a source and keep
-    data in a cache. Using this class is equivalent to initializing
- :class:`AudioReader` with `record=True`. For more information about the
- other parameters see :class:`AudioReader`.
-
- Once the desired amount of data is read, you can call the :func:`rewind`
-    method, then get the recorded data via the :attr:`data` attribute. You can
-    also re-read cached data one window at a time by calling :func:`read`.
+ The most basic auditok audio frame validator.
+ This validator computes the log energy of an input audio frame
+    and returns True if the result is >= a given threshold, False
+ otherwise.
+
+ :Parameters:
+
+ `sample_width` : *(int)*
+        Number of bytes of one audio sample. This is used to convert data from `basestring` or `bytes` to
+ an array of floats.
+
+ `energy_threshold` : *(float)*
+ A threshold used to check whether an input data buffer is valid.
"""
+
+
+ if _WITH_NUMPY:
+
+ _formats = {1: numpy.int8 , 2: numpy.int16, 4: numpy.int32}
+
+ @staticmethod
+ def _convert(signal, sample_width):
+ return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]), dtype=numpy.float64)
+
+ @staticmethod
+ def _signal_energy(signal):
+ return float(numpy.dot(signal, signal)) / len(signal)
+
+ @staticmethod
+ def _signal_log_energy(signal):
+ energy = AudioEnergyValidator._signal_energy(signal)
+ if energy <= 0:
+ return -200
+ return 10. * numpy.log10(energy)
+
+ else:
+
+
+ _formats = {1: 'b' , 2: 'h', 4: 'i'}
+
+ @staticmethod
+ def _convert(signal, sample_width):
+ return array("d", array(AudioEnergyValidator._formats[sample_width], signal))
+
+ @staticmethod
+ def _signal_energy(signal):
+ energy = 0.
+ for a in signal:
+ energy += a * a
+ return energy / len(signal)
+
+ @staticmethod
+ def _signal_log_energy(signal):
+ energy = AudioEnergyValidator._signal_energy(signal)
+ if energy <= 0:
+ return -200
+ return 10. * math.log10(energy)
+
+
+ def __init__(self, sample_width, energy_threshold=45):
+ self.sample_width = sample_width
+ self._energy_threshold = energy_threshold
+
+
+ def is_valid(self, data):
+ """
+        Check if data is valid. Audio data will be converted into an array of
+        signed values whose log energy is computed as follows:
+
+ .. code:: python
+
+ arr = AudioEnergyValidator._convert(signal, sample_width)
+ energy = float(numpy.dot(arr, arr)) / len(arr)
+ log_energy = 10. * numpy.log10(energy)
+
+
+ :Parameters:
+
+        `data` : either a *string* or a *bytes* buffer
+ `data` is converted into a numerical array using the `sample_width`
+ given in the constructor.
+
+        :Returns:
+
+ True if `log_energy` >= `energy_threshold`, False otherwise.
+ """
+
+ signal = AudioEnergyValidator._convert(data, self.sample_width)
+ return AudioEnergyValidator._signal_log_energy(signal) >= self._energy_threshold
+
+ def get_energy_threshold(self):
+ return self._energy_threshold
+
+ def set_energy_threshold(self, threshold):
+ self._energy_threshold = threshold
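A short editorial usage sketch for the validator (the frame content is synthetic; `tobytes` assumes Python 3):

.. code:: python

    from array import array
    from auditok import AudioEnergyValidator
    validator = AudioEnergyValidator(sample_width=2)   # default threshold: 45
    frame = array("h", [300] * 160).tobytes()          # constant 16-bit frame
    # energy = 300**2 = 90000 -> log energy = 10 * log10(90000) ~= 49.5
    validator.is_valid(frame)                          # True, since 49.5 >= 45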
- def __init__(
- self, input, block_dur=0.01, hop_dur=None, max_read=None, **kwargs
- ):
- super().__init__(
- input,
- block_dur=block_dur,
- hop_dur=hop_dur,
- record=True,
- max_read=max_read,
- **kwargs
- )
diff --git a/libs/auditok/workers.py b/libs/auditok/workers.py
deleted file mode 100644
index 314a92004..000000000
--- a/libs/auditok/workers.py
+++ /dev/null
@@ -1,427 +0,0 @@
-import os
-import sys
-from tempfile import NamedTemporaryFile
-from abc import ABCMeta, abstractmethod
-from threading import Thread
-from datetime import datetime, timedelta
-from collections import namedtuple
-import wave
-import subprocess
-from queue import Queue, Empty
-from .io import _guess_audio_format
-from .util import AudioDataSource, make_duration_formatter
-from .core import split
-from .exceptions import (
- EndOfProcessing,
- AudioEncodingError,
- AudioEncodingWarning,
-)
-
-
-_STOP_PROCESSING = "STOP_PROCESSING"
-_Detection = namedtuple("_Detection", "id start end duration")
-
-
-def _run_subprocess(command):
- try:
- with subprocess.Popen(
- command,
- stdin=open(os.devnull, "rb"),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- ) as proc:
- stdout, stderr = proc.communicate()
- return proc.returncode, stdout, stderr
- except Exception:
- err_msg = "Couldn't export audio using command: '{}'".format(command)
- raise AudioEncodingError(err_msg)
-
-
-class Worker(Thread, metaclass=ABCMeta):
- def __init__(self, timeout=0.5, logger=None):
- self._timeout = timeout
- self._logger = logger
- self._inbox = Queue()
- Thread.__init__(self)
-
- def run(self):
- while True:
- message = self._get_message()
- if message == _STOP_PROCESSING:
- break
- if message is not None:
- self._process_message(message)
- self._post_process()
-
- @abstractmethod
- def _process_message(self, message):
- """Process incoming messages"""
-
- def _post_process(self):
- pass
-
- def _log(self, message):
- self._logger.info(message)
-
- def _stop_requested(self):
- try:
- message = self._inbox.get_nowait()
- if message == _STOP_PROCESSING:
- return True
- except Empty:
- return False
-
- def stop(self):
- self.send(_STOP_PROCESSING)
- self.join()
-
- def send(self, message):
- self._inbox.put(message)
-
- def _get_message(self):
- try:
- message = self._inbox.get(timeout=self._timeout)
- return message
- except Empty:
- return None
-
-
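The `Worker` base class above implements a simple inbox-driven thread loop. Here is a standalone sketch of the same pattern (an editorial re-implementation for illustration, not the deleted code itself):

.. code:: python

    from queue import Queue, Empty
    from threading import Thread

    _STOP = "STOP_PROCESSING"

    class EchoWorker(Thread):
        def __init__(self, timeout=0.5):
            Thread.__init__(self)
            self._inbox = Queue()
            self._timeout = timeout

        def run(self):
            while True:
                try:
                    message = self._inbox.get(timeout=self._timeout)
                except Empty:
                    continue          # nothing yet, keep polling
                if message == _STOP:
                    break             # graceful shutdown
                print("processing:", message)

        def send(self, message):
            self._inbox.put(message)

        def stop(self):
            self.send(_STOP)
            self.join()

    # usage: w = EchoWorker(); w.start(); w.send("hello"); w.stop()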
-class TokenizerWorker(Worker, AudioDataSource):
- def __init__(self, reader, observers=None, logger=None, **kwargs):
- self._observers = observers if observers is not None else []
- self._reader = reader
- self._audio_region_gen = split(self, **kwargs)
- self._detections = []
- self._log_format = "[DET]: Detection {0.id} (start: {0.start:.3f}, "
- self._log_format += "end: {0.end:.3f}, duration: {0.duration:.3f})"
- Worker.__init__(self, timeout=0.2, logger=logger)
-
-    def _process_message(self, message):
- pass
-
- @property
- def detections(self):
- return self._detections
-
- def _notify_observers(self, message):
- for observer in self._observers:
- observer.send(message)
-
- def run(self):
- self._reader.open()
- start_processing_timestamp = datetime.now()
- for _id, audio_region in enumerate(self._audio_region_gen, start=1):
- timestamp = start_processing_timestamp + timedelta(
- seconds=audio_region.meta.start
- )
- audio_region.meta.timestamp = timestamp
- detection = _Detection(
- _id,
- audio_region.meta.start,
- audio_region.meta.end,
- audio_region.duration,
- )
- self._detections.append(detection)
- if self._logger is not None:
- message = self._log_format.format(detection)
- self._log(message)
- self._notify_observers((_id, audio_region))
- self._notify_observers(_STOP_PROCESSING)
- self._reader.close()
-
- def start_all(self):
- for observer in self._observers:
- observer.start()
- self.start()
-
- def stop_all(self):
- self.stop()
- for observer in self._observers:
- observer.stop()
- self._reader.close()
-
- def read(self):
- if self._stop_requested():
- return None
- else:
- return self._reader.read()
-
- def __getattr__(self, name):
- return getattr(self._reader, name)
-
-
-class StreamSaverWorker(Worker):
- def __init__(
- self,
- audio_reader,
- filename,
- export_format=None,
- cache_size_sec=0.5,
- timeout=0.2,
- ):
- self._reader = audio_reader
- sample_size_bytes = self._reader.sw * self._reader.ch
- self._cache_size = cache_size_sec * self._reader.sr * sample_size_bytes
- self._output_filename = filename
- self._export_format = _guess_audio_format(export_format, filename)
- if self._export_format is None:
- self._export_format = "wav"
- self._init_output_stream()
- self._exported = False
- self._cache = []
- self._total_cached = 0
- Worker.__init__(self, timeout=timeout)
-
- def _get_non_existent_filename(self):
- filename = self._output_filename + ".wav"
- i = 0
- while os.path.exists(filename):
- i += 1
- filename = self._output_filename + "({}).wav".format(i)
- return filename
-
- def _init_output_stream(self):
- if self._export_format != "wav":
- self._tmp_output_filename = self._get_non_existent_filename()
- else:
- self._tmp_output_filename = self._output_filename
- self._wfp = wave.open(self._tmp_output_filename, "wb")
- self._wfp.setframerate(self._reader.sr)
- self._wfp.setsampwidth(self._reader.sw)
- self._wfp.setnchannels(self._reader.ch)
-
- @property
- def sr(self):
- return self._reader.sampling_rate
-
- @property
- def sw(self):
- return self._reader.sample_width
-
- @property
- def ch(self):
- return self._reader.channels
-
- def __del__(self):
- self._post_process()
-
- if (
- (self._tmp_output_filename != self._output_filename)
- and self._exported
- and os.path.exists(self._tmp_output_filename)
- ):
- os.remove(self._tmp_output_filename)
-
- def _process_message(self, data):
- self._cache.append(data)
- self._total_cached += len(data)
- if self._total_cached >= self._cache_size:
- self._write_cached_data()
-
- def _post_process(self):
- while True:
- try:
- data = self._inbox.get_nowait()
- if data != _STOP_PROCESSING:
- self._cache.append(data)
- self._total_cached += len(data)
- except Empty:
- break
- self._write_cached_data()
- self._wfp.close()
-
- def _write_cached_data(self):
- if self._cache:
- data = b"".join(self._cache)
- self._wfp.writeframes(data)
- self._cache = []
- self._total_cached = 0
-
- def open(self):
- self._reader.open()
-
- def close(self):
- self._reader.close()
- self.stop()
-
- def rewind(self):
- # ensure compatibility with AudioDataSource with record=True
- pass
-
- @property
- def data(self):
- with wave.open(self._tmp_output_filename, "rb") as wfp:
- return wfp.readframes(-1)
-
- def save_stream(self):
- if self._exported:
- return self._output_filename
-
- if self._export_format in ("raw", "wav"):
- if self._export_format == "raw":
- self._export_raw()
- self._exported = True
- return self._output_filename
- try:
- self._export_with_ffmpeg_or_avconv()
- except AudioEncodingError:
- try:
- self._export_with_sox()
- except AudioEncodingError:
- warn_msg = "Couldn't save audio data in the desired format "
- warn_msg += "'{}'. Either none of 'ffmpeg', 'avconv' or 'sox' "
- warn_msg += "is installed or this format is not recognized.\n"
- warn_msg += "Audio file was saved as '{}'"
- raise AudioEncodingWarning(
- warn_msg.format(
- self._export_format, self._tmp_output_filename
- )
- )
- finally:
- self._exported = True
- return self._output_filename
-
- def _export_raw(self):
- with open(self._output_filename, "wb") as wfp:
- wfp.write(self.data)
-
- def _export_with_ffmpeg_or_avconv(self):
- command = [
- "-y",
- "-f",
- "wav",
- "-i",
- self._tmp_output_filename,
- "-f",
- self._export_format,
- self._output_filename,
- ]
- returncode, stdout, stderr = _run_subprocess(["ffmpeg"] + command)
- if returncode != 0:
- returncode, stdout, stderr = _run_subprocess(["avconv"] + command)
- if returncode != 0:
- raise AudioEncodingError(stderr)
- return stdout, stderr
-
- def _export_with_sox(self):
- command = [
- "sox",
- "-t",
- "wav",
- self._tmp_output_filename,
- self._output_filename,
- ]
- returncode, stdout, stderr = _run_subprocess(command)
- if returncode != 0:
- raise AudioEncodingError(stderr)
- return stdout, stderr
-
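For reference, the two exporters above build argv lists equivalent to the following (editorial sketch; the filenames and output format are hypothetical):

.. code:: python

    # tried first with "ffmpeg", then with "avconv" if ffmpeg fails
    ["ffmpeg", "-y", "-f", "wav", "-i", "tmp.wav", "-f", "ogg", "out.ogg"]
    # last resort
    ["sox", "-t", "wav", "tmp.wav", "out.ogg"]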
- def close_output(self):
- self._wfp.close()
-
- def read(self):
- data = self._reader.read()
- if data is not None:
- self.send(data)
- else:
- self.send(_STOP_PROCESSING)
- return data
-
- def __getattr__(self, name):
- if name == "data":
- return self.data
- return getattr(self._reader, name)
-
-
-class PlayerWorker(Worker):
- def __init__(self, player, progress_bar=False, timeout=0.2, logger=None):
- self._player = player
- self._progress_bar = progress_bar
- self._log_format = "[PLAY]: Detection {id} played"
- Worker.__init__(self, timeout=timeout, logger=logger)
-
- def _process_message(self, message):
- _id, audio_region = message
- if self._logger is not None:
- message = self._log_format.format(id=_id)
- self._log(message)
- audio_region.play(
- player=self._player, progress_bar=self._progress_bar, leave=False
- )
-
-
-class RegionSaverWorker(Worker):
- def __init__(
- self,
- filename_format,
- audio_format=None,
- timeout=0.2,
- logger=None,
- **audio_parameters
- ):
- self._filename_format = filename_format
- self._audio_format = audio_format
- self._audio_parameters = audio_parameters
- self._debug_format = "[SAVE]: Detection {id} saved as '{filename}'"
- Worker.__init__(self, timeout=timeout, logger=logger)
-
- def _process_message(self, message):
- _id, audio_region = message
- filename = self._filename_format.format(
- id=_id,
- start=audio_region.meta.start,
- end=audio_region.meta.end,
- duration=audio_region.duration,
- )
- filename = audio_region.save(
- filename, self._audio_format, **self._audio_parameters
- )
- if self._logger:
- message = self._debug_format.format(id=_id, filename=filename)
- self._log(message)
-
-
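An editorial sketch of how the `filename_format` fields above expand (the template string is hypothetical; unused fields are silently ignored by `str.format`):

.. code:: python

    filename_format = "detection_{id}_{start:.3f}-{end:.3f}.wav"
    filename_format.format(id=1, start=0.25, end=1.73, duration=1.48)
    # 'detection_1_0.250-1.730.wav'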
-class CommandLineWorker(Worker):
- def __init__(self, command, timeout=0.2, logger=None):
- self._command = command
- Worker.__init__(self, timeout=timeout, logger=logger)
- self._debug_format = "[COMMAND]: Detection {id} command: '{command}'"
-
- def _process_message(self, message):
- _id, audio_region = message
- with NamedTemporaryFile(delete=False) as file:
- filename = audio_region.save(file.name, audio_format="wav")
- command = self._command.format(file=filename)
- os.system(command)
- if self._logger is not None:
- message = self._debug_format.format(id=_id, command=command)
- self._log(message)
-
-
-class PrintWorker(Worker):
- def __init__(
- self,
- print_format="{start} {end}",
- time_format="%S",
- timestamp_format="%Y/%m/%d %H:%M:%S.%f",
- timeout=0.2,
- ):
-
- self._print_format = print_format
- self._format_time = make_duration_formatter(time_format)
- self._timestamp_format = timestamp_format
- self.detections = []
- Worker.__init__(self, timeout=timeout)
-
- def _process_message(self, message):
- _id, audio_region = message
- timestamp = audio_region.meta.timestamp
- timestamp = timestamp.strftime(self._timestamp_format)
- text = self._print_format.format(
- id=_id,
- start=self._format_time(audio_region.meta.start),
- end=self._format_time(audio_region.meta.end),
- duration=self._format_time(audio_region.duration),
- timestamp=timestamp,
- )
- print(text)
diff --git a/libs/version.txt b/libs/version.txt
index 6bd849547..b5f2fafae 100644
--- a/libs/version.txt
+++ b/libs/version.txt
@@ -34,7 +34,7 @@ urllib3=1.23
Waitress=1.4.3
## indirect dependencies
-auditok=0.2.0 # Required-by: ffsubsync
+auditok=0.1.5 # Required-by: ffsubsync
rich=10.1.0 # Required-by: ffsubsync
srt=3.4.1 # Required-by: ffsubsync