diff options
Diffstat (limited to 'libs/auditok')
-rw-r--r-- | libs/auditok/__init__.py | 19 | ||||
-rw-r--r-- | libs/auditok/cmdline.py | 794 | ||||
-rw-r--r-- | libs/auditok/core.py | 437 | ||||
-rw-r--r-- | libs/auditok/data/1to6arabic_16000_mono_bc_noise.wav | bin | 0 -> 601256 bytes | |||
-rw-r--r-- | libs/auditok/data/was_der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_silence.wav | bin | 0 -> 1493036 bytes | |||
-rw-r--r-- | libs/auditok/dataset.py | 18 | ||||
-rw-r--r-- | libs/auditok/exceptions.py | 3 | ||||
-rw-r--r-- | libs/auditok/io.py | 517 | ||||
-rw-r--r-- | libs/auditok/util.py | 843 |
9 files changed, 2631 insertions, 0 deletions
diff --git a/libs/auditok/__init__.py b/libs/auditok/__init__.py new file mode 100644 index 000000000..4534c7c9c --- /dev/null +++ b/libs/auditok/__init__.py @@ -0,0 +1,19 @@ +""" +:author: + +Amine SEHILI <[email protected]> +2015-2018 + +:License: + +This package is published under GNU GPL Version 3. +""" + +from __future__ import absolute_import +from .core import * +from .io import * +from .util import * +from . import dataset +from .exceptions import * + +__version__ = "0.1.8" diff --git a/libs/auditok/cmdline.py b/libs/auditok/cmdline.py new file mode 100644 index 000000000..5878b0ccc --- /dev/null +++ b/libs/auditok/cmdline.py @@ -0,0 +1,794 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +auditok.auditok -- Audio Activity Detection tool + +auditok.auditok is a program that can be used for Audio/Acoustic activity detection. +It can read audio data from audio files as well as from built-in device(s) or standard input + + +@author: Mohamed El Amine SEHILI + +@copyright: 2015-2018 Mohamed El Amine SEHILI + +@license: GPL v3 + +@contact: [email protected] +@deffield updated: 01 Nov 2018 +''' + +import sys +import os + +from optparse import OptionParser, OptionGroup +from threading import Thread +import tempfile +import wave +import time +import threading +import logging + +try: + import future + from queue import Queue, Empty +except ImportError: + if sys.version_info >= (3, 0): + from queue import Queue, Empty + else: + from Queue import Queue, Empty + +try: + from pydub import AudioSegment + WITH_PYDUB = True +except ImportError: + WITH_PYDUB = False + + +from .core import StreamTokenizer +from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for +from .util import ADSFactory, AudioEnergyValidator +from auditok import __version__ as version + +__all__ = [] +__version__ = version +__date__ = '2015-11-23' +__updated__ = '2018-10-06' + +DEBUG = 0 +TESTRUN = 1 +PROFILE = 0 + +LOGGER_NAME = "AUDITOK_LOGGER" + +class 
AudioFileFormatError(Exception): + pass + +class TimeFormatError(Exception): + pass + +def file_to_audio_source(filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + rawdata = False + + if filetype is not None: + filetype = filetype.lower() + + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required for raw data") + + data = open(filename).read() + rawdata = True + + # try first with pydub + if WITH_PYDUB: + + use_channel = kwargs.pop("use_channel", None) + if use_channel is None: + use_channel = kwargs.pop("uc", None) + + if use_channel is None: + use_channel = 1 + else: + try: + use_channel = int(use_channel) + except ValueError: + pass + + if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] : + raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'") + + asegment = None + + if rawdata: + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")): + asegment = AudioSegment.from_wav(filename) + elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")): + asegment = AudioSegment.from_mp3(filename) + elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")): + asegment = AudioSegment.from_ogg(filename) + elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")): + asegment = AudioSegment.from_flv(filename) + else: + asegment = AudioSegment.from_file(filename) + + if asegment.channels > 1: + + if isinstance(use_channel, int): + 
if use_channel > asegment.channels: + raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels)) + else: + asegment = asegment.split_to_mono()[use_channel - 1] + else: + ch_lower = use_channel.lower() + + if ch_lower == "mix": + asegment = asegment.set_channels(1) + + elif use_channel.lower() == "left": + asegment = asegment.split_to_mono()[0] + + elif use_channel.lower() == "right": + asegment = asegment.split_to_mono()[1] + + return BufferAudioSource(data_buffer = asegment._data, + sampling_rate = asegment.frame_rate, + sample_width = asegment.sample_width, + channels = asegment.channels) + # fall back to standard python + else: + if rawdata: + if ch != 1: + raise ValueError("Cannot handle multi-channel audio without pydub") + return BufferAudioSource(data, srate, swidth, ch) + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + + wfp = wave.open(filename) + + ch = wfp.getnchannels() + if ch != 1: + wfp.close() + raise ValueError("Cannot handle multi-channel audio without pydub") + + srate = wfp.getframerate() + swidth = wfp.getsampwidth() + data = wfp.readframes(wfp.getnframes()) + wfp.close() + return BufferAudioSource(data, srate, swidth, ch) + + raise AudioFileFormatError("Cannot read audio file format") + + +def save_audio_data(data, filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + if filetype is not None: + filetype = filetype.lower() + + # save raw data + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + fp = open(filename, "w") + fp.write(data) + fp.close() + return + + # save other types of data + # requires all audio parameters + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", 
None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required to save no raw data") + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + # use standard python's wave module + fp = wave.open(filename, "w") + fp.setnchannels(ch) + fp.setsampwidth(swidth) + fp.setframerate(srate) + fp.writeframes(data) + fp.close() + + elif WITH_PYDUB: + + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + asegment.export(filename, format=filetype) + + else: + raise AudioFileFormatError("cannot write file format {0} (file name: {1})".format(filetype, filename)) + + +def plot_all(signal, sampling_rate, energy_as_amp, detections=[], show=True, save_as=None): + + import matplotlib.pyplot as plt + import numpy as np + t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate ) + if len(t) > len(signal): + t = t[: len(signal) - len(t)] + + for start, end in detections: + p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4) + + line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude") + plt.plot(t, signal) + legend = plt.legend(["Detection threshold"], bbox_to_anchor=(0., 1.02, 1., .102), loc=1, fontsize=16) + ax = plt.gca().add_artist(legend) + + plt.xlabel("Time (s)", fontsize=24) + plt.ylabel("Amplitude (normalized)", fontsize=24) + + if save_as is not None: + plt.savefig(save_as, dpi=120) + + if show: + plt.show() + + +def seconds_to_str_fromatter(_format): + """ + Accepted format directives: %i %s %m %h + """ + # check directives are correct + + if _format == "%S": + def _fromatter(seconds): + return "{:.2f}".format(seconds) + + elif _format == "%I": + def _fromatter(seconds): + return "{0}".format(int(seconds * 1000)) + + else: + _format = _format.replace("%h", "{hrs:02d}") + _format = _format.replace("%m", "{mins:02d}") + _format = _format.replace("%s", "{secs:02d}") + _format = 
_format.replace("%i", "{millis:03d}") + + try: + i = _format.index("%") + raise TimeFormatError("Unknow time format directive '{0}'".format(_format[i:i+2])) + except ValueError: + pass + + def _fromatter(seconds): + millis = int(seconds * 1000) + hrs, millis = divmod(millis, 3600000) + mins, millis = divmod(millis, 60000) + secs, millis = divmod(millis, 1000) + return _format.format(hrs=hrs, mins=mins, secs=secs, millis=millis) + + return _fromatter + + + +class Worker(Thread): + + def __init__(self, timeout=0.2, debug=False, logger=None): + self.timeout = timeout + self.debug = debug + self.logger = logger + + if self.debug and self.logger is None: + self.logger = logging.getLogger(LOGGER_NAME) + self.logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + self.logger.addHandler(handler) + + self._inbox = Queue() + self._stop_request = Queue() + Thread.__init__(self) + + + def debug_message(self, message): + self.logger.debug(message) + + def _stop_requested(self): + + try: + message = self._stop_request.get_nowait() + if message == "stop": + return True + + except Empty: + return False + + def stop(self): + self._stop_request.put("stop") + self.join() + + def send(self, message): + self._inbox.put(message) + + def _get_message(self): + try: + message = self._inbox.get(timeout=self.timeout) + return message + except Empty: + return None + + +class TokenizerWorker(Worker): + + END_OF_PROCESSING = "END_OF_PROCESSING" + + def __init__(self, ads, tokenizer, analysis_window, observers): + self.ads = ads + self.tokenizer = tokenizer + self.analysis_window = analysis_window + self.observers = observers + self._inbox = Queue() + self.count = 0 + Worker.__init__(self) + + def run(self): + + def notify_observers(data, start, end): + audio_data = b''.join(data) + self.count += 1 + + start_time = start * self.analysis_window + end_time = (end+1) * self.analysis_window + duration = (end - start + 1) * self.analysis_window + + # notify observers + for 
observer in self.observers: + observer.notify({"id" : self.count, + "audio_data" : audio_data, + "start" : start, + "end" : end, + "start_time" : start_time, + "end_time" : end_time, + "duration" : duration} + ) + + self.ads.open() + self.tokenizer.tokenize(data_source=self, callback=notify_observers) + for observer in self.observers: + observer.notify(TokenizerWorker.END_OF_PROCESSING) + + def add_observer(self, observer): + self.observers.append(observer) + + def remove_observer(self, observer): + self.observers.remove(observer) + + def read(self): + if self._stop_requested(): + return None + else: + return self.ads.read() + + +class PlayerWorker(Worker): + + def __init__(self, player, timeout=0.2, debug=False, logger=None): + self.player = player + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + dur = message.pop("duration", None) + _id = message.pop("id", None) + + if audio_data is not None: + if self.debug: + self.debug_message("[PLAY]: Detection {id} played (start:{start}, end:{end}, dur:{dur})".format(id=_id, + start="{:5.2f}".format(start_time), end="{:5.2f}".format(end_time), dur="{:5.2f}".format(dur))) + self.player.play(audio_data) + + def notify(self, message): + self.send(message) + + +class CommandLineWorker(Worker): + + def __init__(self, command, timeout=0.2, debug=False, logger=None): + self.command = command + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", 
None) + _id = message.pop("id", None) + if audio_data is not None: + raw_audio_file = tempfile.NamedTemporaryFile(delete=False) + raw_audio_file.write(audio_data) + cmd = self.command.replace("$", raw_audio_file.name) + if self.debug: + self.debug_message("[CMD ]: Detection {id} command: {cmd}".format(id=_id, cmd=cmd)) + os.system(cmd) + os.unlink(raw_audio_file.name) + + def notify(self, message): + self.send(message) + + +class TokenSaverWorker(Worker): + + def __init__(self, name_format, filetype, timeout=0.2, debug=False, logger=None, **kwargs): + self.name_format = name_format + self.filetype = filetype + self.kwargs = kwargs + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + _id = message.pop("id", None) + if audio_data is not None and len(audio_data) > 0: + fname = self.name_format.format(N=_id, start = "{:.2f}".format(start_time), end = "{:.2f}".format(end_time)) + try: + if self.debug: + self.debug_message("[SAVE]: Detection {id} saved as {fname}".format(id=_id, fname=fname)) + save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs) + except Exception as e: + sys.stderr.write(str(e) + "\n") + + def notify(self, message): + self.send(message) + + +class LogWorker(Worker): + + def __init__(self, print_detections=False, output_format="{start} {end}", + time_formatter=seconds_to_str_fromatter("%S"), timeout=0.2, debug=False, logger=None): + + self.print_detections = print_detections + self.output_format = output_format + self.time_formatter = time_formatter + self.detections = [] + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if 
self._stop_requested(): + break + + message = self._get_message() + + if message is not None: + + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + _id = message.pop("id", None) + start = message.pop("start", None) + end = message.pop("end", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + duration = message.pop("duration", None) + if audio_data is not None and len(audio_data) > 0: + + if self.debug: + self.debug_message("[DET ]: Detection {id} (start:{start}, end:{end})".format(id=_id, + start="{:5.2f}".format(start_time), + end="{:5.2f}".format(end_time))) + + if self.print_detections: + print(self.output_format.format(id = _id, + start = self.time_formatter(start_time), + end = self.time_formatter(end_time), duration = self.time_formatter(duration))) + + self.detections.append((_id, start, end, start_time, end_time)) + + + def notify(self, message): + self.send(message) + + + +def main(argv=None): + '''Command line options.''' + + program_name = os.path.basename(sys.argv[0]) + program_version = version + program_build_date = "%s" % __updated__ + + program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) + #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse + program_longdesc = '''''' # optional - give further explanation about what the program does + program_license = "Copyright 2015-2018 Mohamed El Amine SEHILI \ + Licensed under the General Public License (GPL) Version 3 \nhttp://www.gnu.org/licenses/" + + if argv is None: + argv = sys.argv[1:] + try: + # setup option parser + parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) + + group = OptionGroup(parser, "[Input-Output options]") + group.add_option("-i", "--input", dest="input", help="Input audio or video file. 
Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE") + group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String") + group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT") + group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE") + group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING") + group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING") + group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 
1st or left channel)]", type=str, default="1", metavar="STRING") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.") + group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT") + group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT") + group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT") + group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT") + group.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: keep trailing silence]", action="store_true", default=False) + group.add_option("-e", "--energy-threshold", dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=50, metavar="FLOAT") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.") + group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT") + group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT") + group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, 
metavar="INT") + group.add_option("-I", "--input-device-index", dest="input_device_index", help="Audio device index [default: %default] - only when using PyAudio", type=int, default=None, metavar="INT") + group.add_option("-F", "--audio-frame-per-buffer", dest="frame_per_buffer", help="Audio frame per buffer [default: %default] - only when using PyAudio", type=int, default=1024, metavar="INT") + parser.add_option_group(group) + + group = OptionGroup(parser, "[Do something with detections]", "Use these options to print, play or plot detections.") + group.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command (e.g. -C 'du -h $')", default=None, type=str, metavar="STRING") + group.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False) + group.add_option("-p", "--plot", dest="plot", help="Plot and show audio signal and detections (requires matplotlib)", action="store_true", default=False) + group.add_option("", "--save-image", dest="save_image", help="Save plotted audio signal and detections as a picture or a PDF file (requires matplotlib)", type=str, default=None, metavar="FILE") + group.add_option("", "--printf", dest="printf", help="print detections, one per line, using a user supplied format (e.g. '[{id}]: {start} -- {end}'). Available keywords {id}, {start}, {end} and {duration}", type=str, default="{id} {start} {end}", metavar="STRING") + group.add_option("", "--time-format", dest="time_format", help="format used to print {start} and {end}. [Default= %default]. %S: absolute time in sec. %I: absolute time in ms. If at least one of (%h, %m, %s, %i) is used, convert time into hours, minutes, seconds and millis (e.g. %h:%m:%s.%i). 
Only required fields are printed", type=str, default="%S", metavar="STRING") + parser.add_option_group(group) + + parser.add_option("-q", "--quiet", dest="quiet", help="Do not print any information about detections [default: print 'id', 'start' and 'end' of each detection]", action="store_true", default=False) + parser.add_option("-D", "--debug", dest="debug", help="Print processing operations to STDOUT", action="store_true", default=False) + parser.add_option("", "--debug-file", dest="debug_file", help="Print processing operations to FILE", type=str, default=None, metavar="FILE") + + + + # process options + (opts, args) = parser.parse_args(argv) + + if opts.input == "-": + asource = StdinAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels) + #read data from a file + elif opts.input is not None: + asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel) + + # read data from microphone via pyaudio + else: + try: + asource = PyAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels, + frames_per_buffer = opts.frame_per_buffer, + input_device_index = opts.input_device_index) + except Exception: + sys.stderr.write("Cannot read data from audio device!\n") + sys.stderr.write("You should either install pyaudio or read data from STDIN\n") + sys.exit(2) + + logger = logging.getLogger(LOGGER_NAME) + logger.setLevel(logging.DEBUG) + + handler = logging.StreamHandler(sys.stdout) + if opts.quiet or not opts.debug: + # only critical messages will be printed + handler.setLevel(logging.CRITICAL) + else: + handler.setLevel(logging.DEBUG) + + logger.addHandler(handler) + + if opts.debug_file is not None: + logger.setLevel(logging.DEBUG) + opts.debug = True + handler = logging.FileHandler(opts.debug_file, "w") + fmt = logging.Formatter('[%(asctime)s] | %(message)s') + handler.setFormatter(fmt) + handler.setLevel(logging.DEBUG) + 
logger.addHandler(handler) + + record = opts.output_main is not None or opts.plot or opts.save_image is not None + + ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record) + validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold) + + + if opts.drop_trailing_silence: + mode = StreamTokenizer.DROP_TRAILING_SILENCE + else: + mode = 0 + + analysis_window_per_second = 1. / opts.analysis_window + tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * analysis_window_per_second, + max_length=int(opts.max_duration * analysis_window_per_second), + max_continuous_silence=opts.max_silence * analysis_window_per_second, + mode = mode) + + + observers = [] + tokenizer_worker = None + + if opts.output_tokens is not None: + + try: + # check user format is correct + fname = opts.output_tokens.format(N=0, start=0, end=0) + + # find file type for detections + tok_type = opts.output_type + if tok_type is None: + tok_type = os.path.splitext(opts.output_tokens)[1][1:] + if tok_type == "": + tok_type = "wav" + + token_saver = TokenSaverWorker(name_format=opts.output_tokens, filetype=tok_type, + debug=opts.debug, logger=logger, sr=asource.get_sampling_rate(), + sw=asource.get_sample_width(), + ch=asource.get_channels()) + observers.append(token_saver) + + except Exception: + sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens)) + sys.exit(2) + + if opts.echo: + try: + player = player_for(asource) + player_worker = PlayerWorker(player=player, debug=opts.debug, logger=logger) + observers.append(player_worker) + except Exception: + sys.stderr.write("Cannot get an audio player!\n") + sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n") + sys.exit(2) + + if opts.command is not None and len(opts.command) > 0: + cmd_worker = 
CommandLineWorker(command=opts.command, debug=opts.debug, logger=logger) + observers.append(cmd_worker) + + if not opts.quiet or opts.plot is not None or opts.save_image is not None: + oformat = opts.printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r") + converter = seconds_to_str_fromatter(opts.time_format) + log_worker = LogWorker(print_detections = not opts.quiet, output_format=oformat, + time_formatter=converter, logger=logger, debug=opts.debug) + observers.append(log_worker) + + tokenizer_worker = TokenizerWorker(ads, tokenizer, opts.analysis_window, observers) + + def _save_main_stream(): + # find file type + main_type = opts.output_type + if main_type is None: + main_type = os.path.splitext(opts.output_main)[1][1:] + if main_type == "": + main_type = "wav" + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + if len(data) > 0: + save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(), + sw = asource.get_sample_width(), + ch = asource.get_channels()) + + def _plot(): + import numpy as np + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + signal = AudioEnergyValidator._convert(data, asource.get_sample_width()) + detections = [(det[3] , det[4]) for det in log_worker.detections] + max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1 + energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude + plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections, show = opts.plot, save_as = opts.save_image) + + + # start observer threads + for obs in observers: + obs.start() + # start tokenization thread + tokenizer_worker.start() + + while True: + time.sleep(1) + if len(threading.enumerate()) == 1: + break + + tokenizer_worker = None + + if opts.output_main is not None: + _save_main_stream() + if opts.plot or opts.save_image is not None: + _plot() + + return 0 + + except 
KeyboardInterrupt: + + if tokenizer_worker is not None: + tokenizer_worker.stop() + for obs in observers: + obs.stop() + + if opts.output_main is not None: + _save_main_stream() + if opts.plot or opts.save_image is not None: + _plot() + + return 0 + + except Exception as e: + sys.stderr.write(program_name + ": " + str(e) + "\n") + sys.stderr.write("for help use -h\n") + + return 2 + +if __name__ == "__main__": + if DEBUG: + sys.argv.append("-h") + if TESTRUN: + import doctest + doctest.testmod() + if PROFILE: + import cProfile + import pstats + profile_filename = 'auditok.auditok_profile.txt' + cProfile.run('main()', profile_filename) + statsfile = open("profile_stats.txt", "wb") + p = pstats.Stats(profile_filename, stream=statsfile) + stats = p.strip_dirs().sort_stats('cumulative') + stats.print_stats() + statsfile.close() + sys.exit(0) + sys.exit(main()) diff --git a/libs/auditok/core.py b/libs/auditok/core.py new file mode 100644 index 000000000..fa2ab598c --- /dev/null +++ b/libs/auditok/core.py @@ -0,0 +1,437 @@ +""" +This module gathers processing (i.e. tokenization) classes. + +Class summary +============= + +.. autosummary:: + + StreamTokenizer +""" + +from auditok.util import DataValidator + +__all__ = ["StreamTokenizer"] + + +class StreamTokenizer(): + """ + Class for stream tokenizers. It implements a 4-state automaton scheme + to extract sub-sequences of interest on the fly. + + :Parameters: + + `validator` : + instance of `DataValidator` that implements `is_valid` method. + + `min_length` : *(int)* + Minimum number of frames of a valid token. This includes all \ + tolerated non valid frames within the token. + + `max_length` : *(int)* + Maximum number of frames of a valid token. This includes all \ + tolerated non valid frames within the token. + + `max_continuous_silence` : *(int)* + Maximum number of consecutive non-valid frames within a token. 
+ Note that, within a valid token, there may be many tolerated \ + *silent* regions that each contain a number of non-valid frames up to \ + `max_continuous_silence` + + `init_min` : *(int, default=0)* + Minimum number of consecutive valid frames that must be **initially** \ + gathered before any sequence of non-valid frames can be tolerated. This + option is not always needed, it can be used to drop non-valid tokens as + early as possible. **Default = 0** means that the option is by default + ineffective. + + `init_max_silence` : *(int, default=0)* + Maximum number of tolerated consecutive non-valid frames if the \ + number of already gathered valid frames has not yet reached 'init_min'. + This argument is normally used if `init_min` is used. **Default = 0**, + by default this argument is not taken into consideration. + + `mode` : *(int, default=0)* + `mode` can be: + + 1. `StreamTokenizer.STRICT_MIN_LENGTH`: + if token *i* is delivered because `max_length` + is reached, and token *i+1* is immediately adjacent to + token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts + at frame *k+1*) then accept token *i+1* only if it has a size of at + least `min_length`. The default behavior is to accept token *i+1* + even if it is shorter than `min_length` (given that the above conditions + are fulfilled of course). + + :Examples: + + In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is + accepted although it is shorter than `min_length` (3), because it immediately + follows the latest delivered token: + + .. code:: python + + from auditok import StreamTokenizer, StringDataSource, DataValidator + + class UpperCaseChecker(DataValidator): + def is_valid(self, frame): + return frame.isupper() + + + dsource = StringDataSource("aaaAAAABBbbb") + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, + max_length=4, + max_continuous_silence=0) + + tokenizer.tokenize(dsource) + + :output: + + .. 
code:: python + + [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] + + + The following tokenizer will however reject the 'BB' token: + + .. code:: python + + dsource = StringDataSource("aaaAAAABBbbb") + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, max_length=4, + max_continuous_silence=0, + mode=StreamTokenizer.STRICT_MIN_LENGTH) + tokenizer.tokenize(dsource) + + :output: + + .. code:: python + + [(['A', 'A', 'A', 'A'], 3, 6)] + + + 2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames + from a token to be delivered if and only if it is not **truncated**. + This can be a bit tricky. A token is actually delivered if: + + - a. `max_continuous_silence` is reached + + :or: + + - b. Its length reaches `max_length`. This is called a **truncated** token + + In the current implementation, a `StreamTokenizer`'s decision is only based on already seen + data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated + frame (`max_length` is reached but `max_continuous_silence` not yet) any trailing + silence will be kept because it can potentially be part of a valid token (if `max_length` + was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered + token will not be considered as truncated but a result of *normal* end of detection + (i.e. no more valid data). In that case the trailing silence can be removed if you use + the `StreamTokenizer.DROP_TRAILING_SILENCE` mode. + + :Example: + + .. code:: python + + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3, + max_length=6, max_continuous_silence=3, + mode=StreamTokenizer.DROP_TRAILING_SILENCE) + + dsource = StringDataSource("aaaAAAaaaBBbbbb") + tokenizer.tokenize(dsource) + + :output: + + .. 
code:: python + + [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] + + The first token is delivered with its tailing silence because it is truncated + while the second one has its tailing frames removed. + + Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: + + .. code:: python + + [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)] + + + 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: + use both options. That means: first remove tailing silence, then ckeck if the + token still has at least a length of `min_length`. + """ + + SILENCE = 0 + POSSIBLE_SILENCE = 1 + POSSIBLE_NOISE = 2 + NOISE = 3 + + STRICT_MIN_LENGTH = 2 + DROP_TRAILING_SILENCE = 4 + # alias + DROP_TAILING_SILENCE = 4 + + def __init__(self, validator, + min_length, max_length, max_continuous_silence, + init_min=0, init_max_silence=0, + mode=0): + + if not isinstance(validator, DataValidator): + raise TypeError("'validator' must be an instance of 'DataValidator'") + + if max_length <= 0: + raise ValueError("'max_length' must be > 0 (value={0})".format(max_length)) + + if min_length <= 0 or min_length > max_length: + raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length)) + + if max_continuous_silence >= max_length: + raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence)) + + if init_min >= max_length: + raise ValueError("'init_min' must be < 'max_length' (value={0})".format(max_continuous_silence)) + + self.validator = validator + self.min_length = min_length + self.max_length = max_length + self.max_continuous_silence = max_continuous_silence + self.init_min = init_min + self.init_max_silent = init_max_silence + + self._mode = None + self.set_mode(mode) + self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 + self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 + + self._deliver = None + 
self._tokens = None + self._state = None + self._data = None + self._contiguous_token = False + + self._init_count = 0 + self._silence_length = 0 + self._start_frame = 0 + self._current_frame = 0 + + def set_mode(self, mode): + """ + :Parameters: + + `mode` : *(int)* + New mode, must be one of: + + + - `StreamTokenizer.STRICT_MIN_LENGTH` + + - `StreamTokenizer.DROP_TRAILING_SILENCE` + + - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE` + + - `0` + + See `StreamTokenizer.__init__` for more information about the mode. + """ + + if not mode in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE, + self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]: + + raise ValueError("Wrong value for mode") + + self._mode = mode + self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 + self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 + + def get_mode(self): + """ + Return the current mode. To check whether a specific mode is activated use + the bitwise 'and' operator `&`. Example: + + .. code:: python + + if mode & self.STRICT_MIN_LENGTH != 0: + do_something() + """ + return self._mode + + def _reinitialize(self): + self._contiguous_token = False + self._data = [] + self._tokens = [] + self._state = self.SILENCE + self._current_frame = -1 + self._deliver = self._append_token + + def tokenize(self, data_source, callback=None): + """ + Read data from `data_source`, one frame a time, and process the read frames in + order to detect sequences of frames that make up valid tokens. + + :Parameters: + `data_source` : instance of the :class:`DataSource` class that implements a `read` method. + 'read' should return a slice of signal, i.e. frame (of whatever \ + type as long as it can be processed by validator) and None if \ + there is no more signal. + + `callback` : an optional 3-argument function. + If a `callback` function is given, it will be called each time a valid token + is found. 
+ + + :Returns: + A list of tokens if `callback` is None. Each token is tuple with the following elements: + + .. code python + + (data, start, end) + + where `data` is a list of read frames, `start`: index of the first frame in the + original data and `end` : index of the last frame. + + """ + + self._reinitialize() + + if callback is not None: + self._deliver = callback + + while True: + frame = data_source.read() + if frame is None: + break + self._current_frame += 1 + self._process(frame) + + self._post_process() + + if callback is None: + _ret = self._tokens + self._tokens = None + return _ret + + def _process(self, frame): + + frame_is_valid = self.validator.is_valid(frame) + + if self._state == self.SILENCE: + + if frame_is_valid: + # seems we got a valid frame after a silence + self._init_count = 1 + self._silence_length = 0 + self._start_frame = self._current_frame + self._data.append(frame) + + if self._init_count >= self.init_min: + self._state = self.NOISE + if len(self._data) >= self.max_length: + self._process_end_of_detection(True) + else: + self._state = self.POSSIBLE_NOISE + + elif self._state == self.POSSIBLE_NOISE: + + if frame_is_valid: + self._silence_length = 0 + self._init_count += 1 + self._data.append(frame) + if self._init_count >= self.init_min: + self._state = self.NOISE + if len(self._data) >= self.max_length: + self._process_end_of_detection(True) + + else: + self._silence_length += 1 + if self._silence_length > self.init_max_silent or \ + len(self._data) + 1 >= self.max_length: + # either init_max_silent or max_length is reached + # before _init_count, back to silence + self._data = [] + self._state = self.SILENCE + else: + self._data.append(frame) + + elif self._state == self.NOISE: + + if frame_is_valid: + self._data.append(frame) + if len(self._data) >= self.max_length: + self._process_end_of_detection(True) + + elif self.max_continuous_silence <= 0: + # max token reached at this frame will _deliver if _contiguous_token + # and not 
_strict_min_length + self._process_end_of_detection() + self._state = self.SILENCE + + else: + # this is the first silent frame following a valid one + # and it is tolerated + self._silence_length = 1 + self._data.append(frame) + self._state = self.POSSIBLE_SILENCE + if len(self._data) == self.max_length: + self._process_end_of_detection(True) + # don't reset _silence_length because we still + # need to know the total number of silent frames + + elif self._state == self.POSSIBLE_SILENCE: + + if frame_is_valid: + self._data.append(frame) + self._silence_length = 0 + self._state = self.NOISE + if len(self._data) >= self.max_length: + self._process_end_of_detection(True) + + else: + if self._silence_length >= self.max_continuous_silence: + if self._silence_length < len(self._data): + # _deliver only gathered frames aren't all silent + self._process_end_of_detection() + else: + self._data = [] + self._state = self.SILENCE + self._silence_length = 0 + else: + self._data.append(frame) + self._silence_length += 1 + if len(self._data) >= self.max_length: + self._process_end_of_detection(True) + # don't reset _silence_length because we still + # need to know the total number of silent frames + + def _post_process(self): + if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE: + if len(self._data) > 0 and len(self._data) > self._silence_length: + self._process_end_of_detection() + + def _process_end_of_detection(self, truncated=False): + + if not truncated and self._drop_tailing_silence and self._silence_length > 0: + # happens if max_continuous_silence is reached + # or max_length is reached at a silent frame + self._data = self._data[0: - self._silence_length] + + if (len(self._data) >= self.min_length) or \ + (len(self._data) > 0 and + not self._strict_min_length and self._contiguous_token): + + _end_frame = self._start_frame + len(self._data) - 1 + self._deliver(self._data, self._start_frame, _end_frame) + + if truncated: + # next token (if any) will start 
at _current_frame + 1 + self._start_frame = self._current_frame + 1 + # remember that it is contiguous with the just delivered one + self._contiguous_token = True + else: + self._contiguous_token = False + else: + self._contiguous_token = False + + self._data = [] + + def _append_token(self, data, start, end): + self._tokens.append((data, start, end)) diff --git a/libs/auditok/data/1to6arabic_16000_mono_bc_noise.wav b/libs/auditok/data/1to6arabic_16000_mono_bc_noise.wav Binary files differnew file mode 100644 index 000000000..3339b8a2c --- /dev/null +++ b/libs/auditok/data/1to6arabic_16000_mono_bc_noise.wav diff --git a/libs/auditok/data/was_der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_silence.wav b/libs/auditok/data/was_der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_silence.wav Binary files differnew file mode 100644 index 000000000..b3056b91a --- /dev/null +++ b/libs/auditok/data/was_der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_silence.wav diff --git a/libs/auditok/dataset.py b/libs/auditok/dataset.py new file mode 100644 index 000000000..dbee8f61e --- /dev/null +++ b/libs/auditok/dataset.py @@ -0,0 +1,18 @@ +""" +This module contains links to audio files you can use for test purposes. 
+""" + +import os + +__all__ = ["one_to_six_arabic_16000_mono_bc_noise", "was_der_mensch_saet_mono_44100_lead_trail_silence"] + +_current_dir = os.path.dirname(os.path.realpath(__file__)) + +one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\ +16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep) +"""A wave file that contains a pronunciation of Arabic numbers from 1 to 6""" + +was_der_mensch_saet_mono_44100_lead_trail_silence = "{cd}{sep}data{sep}was_\ +der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_\ +silence.wav".format(cd=_current_dir, sep=os.path.sep) +""" A wave file that contains a sentence between long leading and trailing periods of silence""" diff --git a/libs/auditok/exceptions.py b/libs/auditok/exceptions.py new file mode 100644 index 000000000..f3d0354b0 --- /dev/null +++ b/libs/auditok/exceptions.py @@ -0,0 +1,3 @@ + +class DuplicateArgument(Exception): + pass diff --git a/libs/auditok/io.py b/libs/auditok/io.py new file mode 100644 index 000000000..772147f1c --- /dev/null +++ b/libs/auditok/io.py @@ -0,0 +1,517 @@ +""" +Module for low-level audio input-output operations. + +Class summary +============= + +.. autosummary:: + + AudioSource + Rewindable + BufferAudioSource + WaveAudioSource + PyAudioSource + StdinAudioSource + PyAudioPlayer + + +Function summary +================ + +.. autosummary:: + + from_file + player_for +""" + +from abc import ABCMeta, abstractmethod +import wave +import sys + +__all__ = ["AudioSource", "Rewindable", "BufferAudioSource", "WaveAudioSource", + "PyAudioSource", "StdinAudioSource", "PyAudioPlayer", "from_file", "player_for"] + +DEFAULT_SAMPLE_RATE = 16000 +DEFAULT_SAMPLE_WIDTH = 2 +DEFAULT_NB_CHANNELS = 1 + + +class AudioSource(): + """ + Base class for audio source objects. + + Subclasses should implement methods to open/close and audio stream + and read the desired amount of audio samples. 
+ + :Parameters: + + `sampling_rate` : int + Number of samples per second of audio stream. Default = 16000. + + `sample_width` : int + Size in bytes of one audio sample. Possible values : 1, 2, 4. + Default = 2. + + `channels` : int + Number of channels of audio stream. The current version supports + only mono audio streams (i.e. one channel). + """ + + __metaclass__ = ABCMeta + + def __init__(self, sampling_rate=DEFAULT_SAMPLE_RATE, + sample_width=DEFAULT_SAMPLE_WIDTH, + channels=DEFAULT_NB_CHANNELS): + + if not sample_width in (1, 2, 4): + raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") + + if channels != 1: + raise ValueError("Only mono audio is currently handled") + + self._sampling_rate = sampling_rate + self._sample_width = sample_width + self._channels = channels + + @abstractmethod + def is_open(self): + """ Return True if audio source is open, False otherwise """ + + @abstractmethod + def open(self): + """ Open audio source """ + + @abstractmethod + def close(self): + """ Close audio source """ + + @abstractmethod + def read(self, size): + """ + Read and return `size` audio samples at most. + + :Parameters: + + `size` : int + the number of samples to read. 
+ + :Returns: + + Audio data as a string of length 'N' * 'sample_width' * 'channels', where 'N' is: + + - `size` if `size` < 'left_samples' + + - 'left_samples' if `size` > 'left_samples' + """ + + def get_sampling_rate(self): + """ Return the number of samples per second of audio stream """ + return self.sampling_rate + + @property + def sampling_rate(self): + """ Number of samples per second of audio stream """ + return self._sampling_rate + + @property + def sr(self): + """ Number of samples per second of audio stream """ + return self._sampling_rate + + def get_sample_width(self): + """ Return the number of bytes used to represent one audio sample """ + return self.sample_width + + @property + def sample_width(self): + """ Number of bytes used to represent one audio sample """ + return self._sample_width + + @property + def sw(self): + """ Number of bytes used to represent one audio sample """ + return self._sample_width + + def get_channels(self): + """ Return the number of channels of this audio source """ + return self.channels + + @property + def channels(self): + """ Number of channels of this audio source """ + return self._channels + + @property + def ch(self): + """ Return the number of channels of this audio source """ + return self.channels + + +class Rewindable(): + """ + Base class for rewindable audio streams. + Subclasses should implement methods to return to the beginning of an + audio stream as well as method to move to an absolute audio position + expressed in time or in number of samples. 
+ """ + + __metaclass__ = ABCMeta + + @abstractmethod + def rewind(self): + """ Go back to the beginning of audio stream """ + pass + + @abstractmethod + def get_position(self): + """ Return the total number of already read samples """ + + @abstractmethod + def get_time_position(self): + """ Return the total duration in seconds of already read data """ + + @abstractmethod + def set_position(self, position): + """ Move to an absolute position + + :Parameters: + + `position` : int + number of samples to skip from the start of the stream + """ + + @abstractmethod + def set_time_position(self, time_position): + """ Move to an absolute position expressed in seconds + + :Parameters: + + `time_position` : float + seconds to skip from the start of the stream + """ + pass + + +class BufferAudioSource(AudioSource, Rewindable): + """ + An :class:`AudioSource` that encapsulates and reads data from a memory buffer. + It implements methods from :class:`Rewindable` and is therefore a navigable :class:`AudioSource`. 
+ """ + + def __init__(self, data_buffer, + sampling_rate=DEFAULT_SAMPLE_RATE, + sample_width=DEFAULT_SAMPLE_WIDTH, + channels=DEFAULT_NB_CHANNELS): + + if len(data_buffer) % (sample_width * channels) != 0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + + AudioSource.__init__(self, sampling_rate, sample_width, channels) + self._buffer = data_buffer + self._index = 0 + self._left = 0 if self._buffer is None else len(self._buffer) + self._is_open = False + + def is_open(self): + return self._is_open + + def open(self): + self._is_open = True + + def close(self): + self._is_open = False + self.rewind() + + def read(self, size): + if not self._is_open: + raise IOError("Stream is not open") + + if self._left > 0: + + to_read = size * self.sample_width * self.channels + if to_read > self._left: + to_read = self._left + + data = self._buffer[self._index: self._index + to_read] + self._index += to_read + self._left -= to_read + + return data + + return None + + def get_data_buffer(self): + """ Return all audio data as one string buffer. """ + return self._buffer + + def set_data(self, data_buffer): + """ Set new data for this audio stream. 
+ + :Parameters: + + `data_buffer` : str, basestring, Bytes + a string buffer with a length multiple of (sample_width * channels) + """ + if len(data_buffer) % (self.sample_width * self.channels) != 0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + self._buffer = data_buffer + self._index = 0 + self._left = 0 if self._buffer is None else len(self._buffer) + + def append_data(self, data_buffer): + """ Append data to this audio stream + + :Parameters: + + `data_buffer` : str, basestring, Bytes + a buffer with a length multiple of (sample_width * channels) + """ + + if len(data_buffer) % (self.sample_width * self.channels) != 0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + + self._buffer += data_buffer + self._left += len(data_buffer) + + def rewind(self): + self.set_position(0) + + def get_position(self): + return self._index / self.sample_width + + def get_time_position(self): + return float(self._index) / (self.sample_width * self.sampling_rate) + + def set_position(self, position): + if position < 0: + raise ValueError("position must be >= 0") + + if self._buffer is None: + self._index = 0 + self._left = 0 + return + + position *= self.sample_width + self._index = position if position < len(self._buffer) else len(self._buffer) + self._left = len(self._buffer) - self._index + + def set_time_position(self, time_position): # time in seconds + position = int(self.sampling_rate * time_position) + self.set_position(position) + + +class WaveAudioSource(AudioSource): + """ + A class for an `AudioSource` that reads data from a wave file. 
+ + :Parameters: + + `filename` : + path to a valid wave file + """ + + def __init__(self, filename): + + self._filename = filename + self._audio_stream = None + + stream = wave.open(self._filename) + AudioSource.__init__(self, stream.getframerate(), + stream.getsampwidth(), + stream.getnchannels()) + stream.close() + + def is_open(self): + return self._audio_stream is not None + + def open(self): + if(self._audio_stream is None): + self._audio_stream = wave.open(self._filename) + + def close(self): + if self._audio_stream is not None: + self._audio_stream.close() + self._audio_stream = None + + def read(self, size): + if self._audio_stream is None: + raise IOError("Stream is not open") + else: + data = self._audio_stream.readframes(size) + if data is None or len(data) < 1: + return None + return data + + +class PyAudioSource(AudioSource): + """ + A class for an `AudioSource` that reads data the built-in microphone using PyAudio. + """ + + def __init__(self, sampling_rate=DEFAULT_SAMPLE_RATE, + sample_width=DEFAULT_SAMPLE_WIDTH, + channels=DEFAULT_NB_CHANNELS, + frames_per_buffer=1024, + input_device_index=None): + + AudioSource.__init__(self, sampling_rate, sample_width, channels) + self._chunk_size = frames_per_buffer + self.input_device_index = input_device_index + + import pyaudio + self._pyaudio_object = pyaudio.PyAudio() + self._pyaudio_format = self._pyaudio_object.get_format_from_width(self.sample_width) + self._audio_stream = None + + def is_open(self): + return self._audio_stream is not None + + def open(self): + self._audio_stream = self._pyaudio_object.open(format=self._pyaudio_format, + channels=self.channels, + rate=self.sampling_rate, + input=True, + output=False, + input_device_index=self.input_device_index, + frames_per_buffer=self._chunk_size) + + def close(self): + if self._audio_stream is not None: + self._audio_stream.stop_stream() + self._audio_stream.close() + self._audio_stream = None + + def read(self, size): + if self._audio_stream is 
None: + raise IOError("Stream is not open") + + if self._audio_stream.is_active(): + data = self._audio_stream.read(size) + if data is None or len(data) < 1: + return None + return data + + return None + + +class StdinAudioSource(AudioSource): + """ + A class for an :class:`AudioSource` that reads data from standard input. + """ + + def __init__(self, sampling_rate=DEFAULT_SAMPLE_RATE, + sample_width=DEFAULT_SAMPLE_WIDTH, + channels=DEFAULT_NB_CHANNELS): + + AudioSource.__init__(self, sampling_rate, sample_width, channels) + self._is_open = False + + def is_open(self): + return self._is_open + + def open(self): + self._is_open = True + + def close(self): + self._is_open = False + + def read(self, size): + if not self._is_open: + raise IOError("Stream is not open") + + to_read = size * self.sample_width * self.channels + if sys.version_info >= (3, 0): + data = sys.stdin.buffer.read(to_read) + else: + data = sys.stdin.read(to_read) + + if data is None or len(data) < 1: + return None + + return data + + +class PyAudioPlayer(): + """ + A class for audio playback using Pyaudio + """ + + def __init__(self, sampling_rate=DEFAULT_SAMPLE_RATE, + sample_width=DEFAULT_SAMPLE_WIDTH, + channels=DEFAULT_NB_CHANNELS): + if not sample_width in (1, 2, 4): + raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") + + self.sampling_rate = sampling_rate + self.sample_width = sample_width + self.channels = channels + + import pyaudio + self._p = pyaudio.PyAudio() + self.stream = self._p.open(format=self._p.get_format_from_width(self.sample_width), + channels=self.channels, rate=self.sampling_rate, + input=False, output=True) + + def play(self, data): + if self.stream.is_stopped(): + self.stream.start_stream() + + for chunk in self._chunk_data(data): + self.stream.write(chunk) + + self.stream.stop_stream() + + def stop(self): + if not self.stream.is_stopped(): + self.stream.stop_stream() + self.stream.close() + self._p.terminate() + + def _chunk_data(self, data): + # make 
audio chunks of 100 ms to allow interruption (like ctrl+c) + chunk_size = int((self.sampling_rate * self.sample_width * self.channels) / 10) + start = 0 + while start < len(data): + yield data[start: start + chunk_size] + start += chunk_size + + +def from_file(filename): + """ + Create an `AudioSource` object using the audio file specified by `filename`. + The appropriate :class:`AudioSource` class is guessed from file's extension. + + :Parameters: + + `filename` : + path to an audio file. + + :Returns: + + an `AudioSource` object that reads data from the given file. + """ + + if filename.lower().endswith(".wav"): + return WaveAudioSource(filename) + + raise Exception("Can not create an AudioSource object from '%s'" % (filename)) + + +def player_for(audio_source): + """ + Return a :class:`PyAudioPlayer` that can play data from `audio_source`. + + :Parameters: + + `audio_source` : + an `AudioSource` object. + + :Returns: + + `PyAudioPlayer` that has the same sampling rate, sample width and number of channels + as `audio_source`. + """ + + return PyAudioPlayer(audio_source.get_sampling_rate(), + audio_source.get_sample_width(), + audio_source.get_channels()) diff --git a/libs/auditok/util.py b/libs/auditok/util.py new file mode 100644 index 000000000..9bf9c8cf9 --- /dev/null +++ b/libs/auditok/util.py @@ -0,0 +1,843 @@ +""" +Class summary +============= + +.. 
autosummary:: + + DataSource + StringDataSource + ADSFactory + ADSFactory.AudioDataSource + ADSFactory.ADSDecorator + ADSFactory.OverlapADS + ADSFactory.LimiterADS + ADSFactory.RecorderADS + DataValidator + AudioEnergyValidator + +""" + +from abc import ABCMeta, abstractmethod +import math +from array import array +from .io import Rewindable, from_file, BufferAudioSource, PyAudioSource +from .exceptions import DuplicateArgument +import sys + +try: + import numpy + _WITH_NUMPY = True +except ImportError as e: + _WITH_NUMPY = False + +try: + from builtins import str + basestring = str +except ImportError as e: + if sys.version_info >= (3, 0): + basestring = str + +__all__ = ["DataSource", "DataValidator", "StringDataSource", "ADSFactory", "AudioEnergyValidator"] + + +class DataSource(): + """ + Base class for objects passed to :func:`auditok.core.StreamTokenizer.tokenize`. + Subclasses should implement a :func:`DataSource.read` method. + """ + __metaclass__ = ABCMeta + + @abstractmethod + def read(self): + """ + Read a piece of data read from this source. + If no more data is available, return None. + """ + + +class DataValidator(): + """ + Base class for a validator object used by :class:`.core.StreamTokenizer` to check + if read data is valid. + Subclasses should implement :func:`is_valid` method. + """ + __metaclass__ = ABCMeta + + @abstractmethod + def is_valid(self, data): + """ + Check whether `data` is valid + """ + + +class StringDataSource(DataSource): + """ + A class that represent a :class:`DataSource` as a string buffer. + Each call to :func:`DataSource.read` returns on character and moves one step forward. + If the end of the buffer is reached, :func:`read` returns None. + + :Parameters: + + `data` : + a basestring object. + + """ + + def __init__(self, data): + + self._data = None + self._current = 0 + self.set_data(data) + + def read(self): + """ + Read one character from buffer. 
+ + :Returns: + + Current character or None if end of buffer is reached + """ + + if self._current >= len(self._data): + return None + self._current += 1 + return self._data[self._current - 1] + + def set_data(self, data): + """ + Set a new data buffer. + + :Parameters: + + `data` : a basestring object + New data buffer. + """ + + if not isinstance(data, basestring): + raise ValueError("data must an instance of basestring") + self._data = data + self._current = 0 + + +class ADSFactory: + """ + Factory class that makes it easy to create an :class:`ADSFactory.AudioDataSource` object that implements + :class:`DataSource` and can therefore be passed to :func:`auditok.core.StreamTokenizer.tokenize`. + + Whether you read audio data from a file, the microphone or a memory buffer, this factory + instantiates and returns the right :class:`ADSFactory.AudioDataSource` object. + + There are many other features you want your :class:`ADSFactory.AudioDataSource` object to have, such as: + memorize all read audio data so that you can rewind and reuse it (especially useful when + reading data from the microphone), read a fixed amount of data (also useful when reading + from the microphone), read overlapping audio frames (often needed when dosing a spectral + analysis of data). + + :func:`ADSFactory.ads` automatically creates and return object with the desired behavior according + to the supplied keyword arguments. 
+ """ + + @staticmethod + def _check_normalize_args(kwargs): + + for k in kwargs: + if not k in ["block_dur", "hop_dur", "block_size", "hop_size", "max_time", "record", + "audio_source", "filename", "data_buffer", "frames_per_buffer", "sampling_rate", + "sample_width", "channels", "sr", "sw", "ch", "asrc", "fn", "fpb", "db", "mt", + "rec", "bd", "hd", "bs", "hs"]: + raise ValueError("Invalid argument: {0}".format(k)) + + if "block_dur" in kwargs and "bd" in kwargs: + raise DuplicateArgument("Either 'block_dur' or 'bd' must be specified, not both") + + if "hop_dur" in kwargs and "hd" in kwargs: + raise DuplicateArgument("Either 'hop_dur' or 'hd' must be specified, not both") + + if "block_size" in kwargs and "bs" in kwargs: + raise DuplicateArgument("Either 'block_size' or 'bs' must be specified, not both") + + if "hop_size" in kwargs and "hs" in kwargs: + raise DuplicateArgument("Either 'hop_size' or 'hs' must be specified, not both") + + if "max_time" in kwargs and "mt" in kwargs: + raise DuplicateArgument("Either 'max_time' or 'mt' must be specified, not both") + + if "audio_source" in kwargs and "asrc" in kwargs: + raise DuplicateArgument("Either 'audio_source' or 'asrc' must be specified, not both") + + if "filename" in kwargs and "fn" in kwargs: + raise DuplicateArgument("Either 'filename' or 'fn' must be specified, not both") + + if "data_buffer" in kwargs and "db" in kwargs: + raise DuplicateArgument("Either 'filename' or 'db' must be specified, not both") + + if "frames_per_buffer" in kwargs and "fbb" in kwargs: + raise DuplicateArgument("Either 'frames_per_buffer' or 'fpb' must be specified, not both") + + if "sampling_rate" in kwargs and "sr" in kwargs: + raise DuplicateArgument("Either 'sampling_rate' or 'sr' must be specified, not both") + + if "sample_width" in kwargs and "sw" in kwargs: + raise DuplicateArgument("Either 'sample_width' or 'sw' must be specified, not both") + + if "channels" in kwargs and "ch" in kwargs: + raise 
@staticmethod
def ads(**kwargs):
    """
    Create and return an :class:`ADSFactory.AudioDataSource`. The type and behavior of the object is the result
    of the supplied parameters. Every parameter has a long and a short name; the two names of one
    parameter should not be supplied together.

    :Parameters:

    *No parameters* :
       read audio data from the available built-in microphone with the default parameters.
       The returned :class:`ADSFactory.AudioDataSource` encapsulates an :class:`io.PyAudioSource` object and hence
       it accepts the next four parameters, which are used instead of their default values when supplied.

    `sampling_rate`, `sr` : *(int)*
        number of samples per second. Default = 16000.

    `sample_width`, `sw` : *(int)*
        number of bytes per sample (must be in (1, 2, 4)). Default = 2

    `channels`, `ch` : *(int)*
        number of audio channels. Default = 1 (only this value is currently accepted)

    `frames_per_buffer`, `fpb` : *(int)*
        number of samples of PyAudio buffer. Default = 1024.

    `audio_source`, `asrc` : an `AudioSource` object
        read data from this audio source

    `filename`, `fn` : *(string)*
        build an `io.AudioSource` object using this file (currently only wave format is supported)

    `data_buffer`, `db` : *(string)*
        build an `io.BufferAudioSource` using data in `data_buffer`. If this keyword is used,
        `sampling_rate`, `sample_width` and `channels` are passed to `io.BufferAudioSource`
        constructor and used instead of default values.

    `max_time`, `mt` : *(float)*
        maximum time (in seconds) to read. Default behavior: read until there is no more data
        available.

    `record`, `rec` : *(bool)*
        save all read data in cache. Provide a navigable object which boasts a `rewind` method.
        Default = False.

    `block_dur`, `bd` : *(float)*
        processing block duration in seconds. This represents the quantity of audio data to return
        each time the :func:`read` method is invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling
        rate is 8000 and the sample width is 2 bytes, :func:`read` returns a buffer of 0.025 * 8000 * 2 = 400
        bytes at most. `block_dur` and `block_size` are mutually exclusive (supplying both raises
        `DuplicateArgument`). If neither parameter is given, the block covers 0.01 second (i.e. 10 ms).

    `hop_dur`, `hd` : *(float)*
        quantity of data to skip from current processing window. If `hop_dur` is supplied then there
        will be an overlap of `block_dur` - `hop_dur` between two adjacent blocks. `hop_dur` and
        `hop_size` are mutually exclusive (supplying both raises `DuplicateArgument`). If neither
        parameter is given, there is no overlap between two consecutively read blocks.

    `block_size`, `bs` : *(int)*
        number of samples to read each time the `read` method is called. Default: a block size
        that represents a window of 10ms, so for a sampling rate of 16000, the default `block_size`
        is 160 samples, for a rate of 44100, `block_size` = 441 samples, etc.

    `hop_size`, `hs` : *(int)*
        determines the number of overlapping samples between two adjacent read windows. For a
        `hop_size` of value *N*, the overlap is `block_size` - *N*. Default : `hop_size` = `block_size`,
        means that there is no overlap. Must be > 0 and <= `block_size`.

    :Returns:

    An AudioDataSource object that has the desired features.

    :Raises:

    `DuplicateArgument` if both `block_dur` and `block_size`, or both `hop_dur` and `hop_size`, are given.

    `ValueError` if the resulting hop size is <= 0 or greater than the block size.

    `Warning` (raised as an exception, not merely issued) if more than one of `audio_source`,
    `filename` and `data_buffer` is supplied.

    :Examples:

    1. **Create an AudioDataSource that reads data from the microphone (requires Pyaudio) with default audio parameters:**

    .. code:: python

        from auditok import ADSFactory
        ads = ADSFactory.ads()
        ads.get_sampling_rate()
        16000
        ads.get_sample_width()
        2
        ads.get_channels()
        1

    2. **Create an AudioDataSource that reads data from the microphone with a sampling rate of 48KHz:**

    .. code:: python

        from auditok import ADSFactory
        ads = ADSFactory.ads(sr=48000)
        ads.get_sampling_rate()
        48000

    3. **Create an AudioDataSource that reads data from a wave file:**

    .. code:: python

        import auditok
        from auditok import ADSFactory
        ads = ADSFactory.ads(fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
        ads.get_sampling_rate()
        44100
        ads.get_sample_width()
        2
        ads.get_channels()
        1

    4. **Define size of read blocks as 20 ms**

    .. code:: python

        import auditok
        from auditok import ADSFactory
        '''
        we know sampling rate for previous file is 44100 samples/second
        so 10 ms are equivalent to 441 samples and 20 ms to 882
        '''
        block_size = 882
        ads = ADSFactory.ads(bs = 882, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
        ads.open()
        # read one block
        data = ads.read()
        ads.close()
        len(data)
        1764
        assert len(data) == ads.get_sample_width() * block_size

    5. **Define block size as a duration (use block_dur or bd):**

    .. code:: python

        import auditok
        from auditok import ADSFactory
        dur = 0.25 # second
        ads = ADSFactory.ads(bd = dur, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
        '''
        we know sampling rate for previous file is 44100 samples/second
        for a block duration of 250 ms, block size should be 0.25 * 44100 = 11025
        '''
        ads.get_block_size()
        11025
        assert ads.get_block_size() == int(0.25 * 44100)
        ads.open()
        # read one block
        data = ads.read()
        ads.close()
        len(data)
        22050
        assert len(data) == ads.get_sample_width() * ads.get_block_size()

    6. **Read overlapping blocks (one of hop_size, hs, hop_dur or hd > 0):**

    For better readability we'd better use :class:`auditok.io.BufferAudioSource` with a string buffer:

    .. code:: python

        import auditok
        from auditok import ADSFactory
        '''
        we supply a data buffer instead of a file (keyword 'data_buffer' or 'db')
        sr : sampling rate = 16 samples/sec
        sw : sample width = 1 byte
        ch : channels = 1
        '''
        buffer = "abcdefghijklmnop" # 16 bytes = 1 second of data
        bd = 0.250 # block duration = 250 ms = 4 bytes
        hd = 0.125 # hop duration = 125 ms = 2 bytes
        ads = ADSFactory.ads(db = "abcdefghijklmnop", bd = bd, hd = hd, sr = 16, sw = 1, ch = 1)
        ads.open()
        ads.read()
        'abcd'
        ads.read()
        'cdef'
        ads.read()
        'efgh'
        ads.read()
        'ghij'
        data = ads.read()
        assert data == 'ijkl'

    7. **Limit amount of read data (use max_time or mt):**

    .. code:: python

        '''
        We know audio file is larger than 2.25 seconds
        We want to read up to 2.25 seconds of audio data
        '''
        ads = ADSFactory.ads(mt = 2.25, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
        ads.open()
        data = []
        while True:
            d = ads.read()
            if d is None:
                break
            data.append(d)

        ads.close()
        data = b''.join(data)
        assert len(data) == int(ads.get_sampling_rate() * 2.25 * ads.get_sample_width() * ads.get_channels())
    """

    # copy user's dictionary (shallow copy) so the caller's mapping is never mutated
    kwargs = kwargs.copy()

    # check and normalize keyword arguments (after this call only the short
    # names "bd", "hd", "bs", "hs", "mt", "asrc", "fn", "db" and "rec" are present)
    ADSFactory._check_normalize_args(kwargs)

    # Pop everything that is handled here; whatever remains in kwargs is
    # forwarded to the BufferAudioSource / PyAudioSource constructor below.
    block_dur = kwargs.pop("bd")
    hop_dur = kwargs.pop("hd")
    block_size = kwargs.pop("bs")
    hop_size = kwargs.pop("hs")
    max_time = kwargs.pop("mt")
    audio_source = kwargs.pop("asrc")
    filename = kwargs.pop("fn")
    data_buffer = kwargs.pop("db")
    record = kwargs.pop("rec")

    # Case 1: an audio source is supplied
    if audio_source is not None:
        if (filename, data_buffer) != (None, None):
            # NOTE(review): the Warning is raised, so despite the wording of the
            # message the call fails instead of falling back to 'audio_source'.
            raise Warning("You should provide one of 'audio_source', 'filename' or 'data_buffer'\
                 keyword parameters. 'audio_source' will be used")

    # Case 2: a file name is supplied
    elif filename is not None:
        if data_buffer is not None:
            raise Warning("You should provide one of 'filename' or 'data_buffer'\
                 keyword parameters. 'filename' will be used")
        audio_source = from_file(filename)

    # Case 3: a data_buffer is supplied
    elif data_buffer is not None:
        audio_source = BufferAudioSource(data_buffer=data_buffer, **kwargs)

    # Case 4: try to access native audio input
    else:
        audio_source = PyAudioSource(**kwargs)

    if block_dur is not None:
        if block_size is not None:
            raise DuplicateArgument("Either 'block_dur' or 'block_size' can be specified, not both")
        else:
            # convert the duration (seconds) into a number of samples
            block_size = int(audio_source.get_sampling_rate() * block_dur)
    elif block_size is None:
        # Set default block_size to 10 ms
        block_size = int(audio_source.get_sampling_rate() / 100)

    # Instantiate base AudioDataSource
    ads = ADSFactory.AudioDataSource(audio_source=audio_source, block_size=block_size)

    # Decorators are stacked in this order: limiter, then recorder, then overlap.

    # Limit data to be read
    if max_time is not None:
        ads = ADSFactory.LimiterADS(ads=ads, max_time=max_time)

    # Record, rewind and reuse data
    if record:
        ads = ADSFactory.RecorderADS(ads=ads)

    # Read overlapping blocks of data
    if hop_dur is not None:
        if hop_size is not None:
            raise DuplicateArgument("Either 'hop_dur' or 'hop_size' can be specified, not both")
        else:
            hop_size = int(audio_source.get_sampling_rate() * hop_dur)

    if hop_size is not None:
        if hop_size <= 0 or hop_size > block_size:
            raise ValueError("hop_size must be > 0 and <= block_size")
        # hop_size == block_size means no overlap, so no decorator is needed
        if hop_size < block_size:
            ads = ADSFactory.OverlapADS(ads=ads, hop_size=hop_size)

    return ads
class AudioDataSource(DataSource):
    """
    Base class for AudioDataSource objects.
    It inherits from DataSource and encapsulates an AudioSource object.

    Every call to :func:`read` asks the wrapped audio source for `block_size`
    samples; all other methods delegate to the wrapped audio source.
    """

    def __init__(self, audio_source, block_size):
        self.audio_source = audio_source
        self.block_size = block_size

    # ---- block size accessors -------------------------------------------

    def get_block_size(self):
        return self.block_size

    def set_block_size(self, size):
        self.block_size = size

    # ---- wrapped audio source accessors -----------------------------------

    def get_audio_source(self):
        return self.audio_source

    def set_audio_source(self, audio_source):
        self.audio_source = audio_source

    # ---- life cycle, delegated to the wrapped audio source ----------------

    def open(self):
        self.audio_source.open()

    def close(self):
        self.audio_source.close()

    def is_open(self):
        return self.audio_source.is_open()

    # ---- audio parameters, delegated to the wrapped audio source ----------

    def get_sampling_rate(self):
        return self.audio_source.get_sampling_rate()

    def get_sample_width(self):
        return self.audio_source.get_sample_width()

    def get_channels(self):
        return self.audio_source.get_channels()

    # ---- rewinding ---------------------------------------------------------

    def is_rewindable(self):
        """Return True if the wrapped audio source is a `Rewindable`."""
        return isinstance(self.audio_source, Rewindable)

    def rewind(self):
        """Rewind the wrapped audio source; raise `Exception` if it is not a `Rewindable`."""
        if not isinstance(self.audio_source, Rewindable):
            raise Exception("Audio source is not rewindable")
        self.audio_source.rewind()

    # ---- reading -----------------------------------------------------------

    def read(self):
        """Read and return one block of `block_size` samples from the audio source."""
        return self.audio_source.read(self.block_size)
class ADSDecorator(AudioDataSource):
    """
    Base decorator class for AudioDataSource objects.

    A decorator wraps another AudioDataSource (`ads`) and forwards most calls
    to it. Concrete decorators must implement :func:`_reinit`, which resets
    their internal state whenever the underlying data changes (rewind or
    replacement of the audio source).
    """
    # NOTE: `__metaclass__` is only honoured by Python 2; on Python 3 this
    # attribute has no effect and `@abstractmethod` is therefore not enforced.
    __metaclass__ = ABCMeta

    def __init__(self, ads):
        self.ads = ads

        # Forward these calls straight to the wrapped object by binding its
        # methods on the instance.
        self.get_block_size = self.ads.get_block_size
        self.set_block_size = self.ads.set_block_size
        self.get_audio_source = self.ads.get_audio_source
        # NOTE(review): this instance attribute shadows the class-level
        # `open` defined below, so that method (and its `_reinit` call) is
        # never reached through an instance. Kept as is to preserve the
        # existing behavior -- confirm which one is intended.
        self.open = self.ads.open
        self.close = self.ads.close
        self.is_open = self.ads.is_open
        self.get_sampling_rate = self.ads.get_sampling_rate
        self.get_sample_width = self.ads.get_sample_width
        self.get_channels = self.ads.get_channels

    def is_rewindable(self):
        """Return whether the wrapped AudioDataSource is rewindable."""
        # Call the wrapped method: returning the bound method itself would
        # always be truthy, whatever the wrapped object answers.
        return self.ads.is_rewindable()

    def rewind(self):
        """Rewind the wrapped AudioDataSource and reset this decorator's state."""
        self.ads.rewind()
        self._reinit()

    def set_audio_source(self, audio_source):
        """Replace the underlying audio source and reset this decorator's state."""
        self.ads.set_audio_source(audio_source)
        self._reinit()

    def open(self):
        """Open the wrapped AudioDataSource if needed and reset this decorator's state."""
        if not self.ads.is_open():
            self.ads.open()
            self._reinit()

    @abstractmethod
    def _reinit(self):
        """Reset the decorator's internal state; implemented by subclasses."""
        pass
class OverlapADS(ADSDecorator):
    """
    A class for AudioDataSource objects that can read and return overlapping
    audio frames.

    The first read returns a whole block; each following read fetches only
    `hop_size` new samples and prepends the cached tail of the previous block.

    :Parameters:

    `ads` :
        the AudioDataSource to decorate.

    `hop_size` : *(int)*
        number of new samples per read, between 1 and the block size of `ads`
        (both inclusive).

    :Raises:

    `ValueError` if `hop_size` is out of range.
    """

    def __init__(self, ads, hop_size):
        ADSFactory.ADSDecorator.__init__(self, ads)

        if hop_size <= 0 or hop_size > self.get_block_size():
            raise ValueError("hop_size must be either 'None' or \
                 between 1 and block_size (both inclusive)")
        self.hop_size = hop_size
        # Remember the real block size: the wrapped object's block size is
        # temporarily lowered to `hop_size` after the first read.
        self._actual_block_size = self.get_block_size()
        self._reinit()

    def _get_block_size(self):
        # `self` was missing from the signature, which made this method
        # impossible to call on an instance.
        return self._actual_block_size

    def _read_first_block(self):
        # For the first call, we need an entire block of size 'block_size'
        block = self.ads.read()
        if block is None:
            return None

        # Keep a slice of data in cache and append it in the next call
        if len(block) > self._hop_size_bytes:
            self._cache = block[self._hop_size_bytes:]

        # Up from the next call, we will use '_read_next_blocks'
        # and we only read 'hop_size'
        self.ads.set_block_size(self.hop_size)
        self.read = self._read_next_blocks

        return block

    def _read_next_blocks(self):
        block = self.ads.read()
        if block is None:
            return None

        # Append block to cache data to ensure overlap. The cache is None
        # when there is nothing to overlap with (e.g. hop_size == block_size
        # or a short previous block); concatenating None would raise TypeError.
        if self._cache is not None:
            block = self._cache + block
        # Keep a slice of data in cache only if we have a full length block
        # if we don't that means that this is the last block
        if len(block) == self._block_size_bytes:
            self._cache = block[self._hop_size_bytes:]
        else:
            self._cache = None

        return block

    def read(self):
        # Placeholder: `_reinit` binds the effective reader on the instance.
        pass

    def _reinit(self):
        # Back to the initial state: no cached data, full block size on the
        # wrapped object and a whole block to read on the next call.
        self._cache = None
        self.ads.set_block_size(self._actual_block_size)
        self._hop_size_bytes = self.hop_size * \
            self.get_sample_width() * \
            self.get_channels()
        self._block_size_bytes = self.get_block_size() * \
            self.get_sample_width() * \
            self.get_channels()
        self.read = self._read_first_block


class LimiterADS(ADSDecorator):
    """
    A class for AudioDataSource objects that can read a fixed amount of data.
    This can be useful when reading data from the microphone or from large audio files.

    :Parameters:

    `ads` :
        the AudioDataSource to decorate.

    `max_time` : *(float)*
        maximum duration, in seconds, of data to read.
    """

    def __init__(self, ads, max_time):
        ADSFactory.ADSDecorator.__init__(self, ads)

        self.max_time = max_time
        self._reinit()

    def read(self):
        """
        Return the next block, or None once the byte budget has been reached
        or the wrapped object has no more data. The wrapped object is closed
        as soon as the budget is reached.
        """
        if self._total_read_bytes >= self._max_read_bytes:
            return None
        block = self.ads.read()
        if block is None:
            return None
        self._total_read_bytes += len(block)

        if self._total_read_bytes >= self._max_read_bytes:
            self.close()

        return block

    def _reinit(self):
        # Byte budget = whole number of samples in `max_time` seconds,
        # times bytes per sample, times number of channels.
        self._max_read_bytes = int(self.max_time * self.get_sampling_rate()) * \
            self.get_sample_width() * \
            self.get_channels()
        self._total_read_bytes = 0
class RecorderADS(ADSDecorator):
    """
    A class for AudioDataSource objects that can record all audio data they read,
    with a rewind facility.

    While recording, every block read is kept in a cache. The first rewind
    turns that cache into a `BufferAudioSource` which replaces the original
    audio source; from then on data is read without being recorded again.
    """

    def __init__(self, ads):
        ADSFactory.ADSDecorator.__init__(self, ads)
        self._reinit()

    def read(self):
        # Placeholder: `_reinit` and `rewind` bind the effective reader
        # (`_read_and_rec` or `_read_simple`) on the instance.
        pass

    def _read_and_rec(self):
        # Read one block and remember it when there is one
        chunk = self.ads.read()
        if chunk is None:
            return chunk
        self._cache.append(chunk)
        return chunk

    def _read_simple(self):
        # Read without recording
        return self.ads.read()

    def rewind(self):
        if not self._record:
            # Already replaying recorded data: rewind the wrapped object
            # and make sure it is open again
            self.ads.rewind()
            if not self.is_open():
                self.open()
            return

        # Has been recording: create a new BufferAudioSource
        # from recorded data
        recorded = self._concatenate(self._cache)
        replay_source = BufferAudioSource(recorded,
                                          self.get_sampling_rate(),
                                          self.get_sample_width(),
                                          self.get_channels())

        # `set_audio_source` triggers `_reinit`, so the recording state is
        # switched off only after the source has been replaced and opened
        self.set_audio_source(replay_source)
        self.open()
        self._cache = []
        self._record = False
        self.read = self._read_simple

    def is_rewindable(self):
        return True

    def _reinit(self):
        # when audio_source is replaced, start recording again
        self._record = True
        self._cache = []
        self.read = self._read_and_rec

    def _concatenate(self, data):
        try:
            # should always work for python 2
            # work for python 3 ONLY if data is a list (or an iterator)
            # whose each element is a 'bytes' objects
            joined = b''.join(data)
        except TypeError:
            # work for 'str' in python 2 and python 3
            joined = ''.join(data)
        return joined
class AudioEnergyValidator(DataValidator):
    """
    The most basic auditok audio frame validator.
    This validator computes the log energy of an input audio frame
    and returns True if the result is >= a given threshold, False
    otherwise.

    :Parameters:

    `sample_width` : *(int)*
        Number of bytes of one audio sample (1, 2 or 4). This is used to convert data from
        `basestring` or `Bytes` to an array of floats.

    `energy_threshold` : *(float)*
        A threshold used to check whether an input data buffer is valid. Default = 45.
    """

    # Two equivalent implementations: a numpy based one and a pure python
    # fallback, selected once when the class is created.
    if _WITH_NUMPY:
        _formats = {1: numpy.int8, 2: numpy.int16, 4: numpy.int32}

        @staticmethod
        def _convert(signal, sample_width):
            return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]),
                               dtype=numpy.float64)

        @staticmethod
        def _signal_energy(signal):
            # An empty frame carries no energy; without this guard the
            # division below raises ZeroDivisionError.
            if len(signal) == 0:
                return 0.
            return float(numpy.dot(signal, signal)) / len(signal)

        @staticmethod
        def _signal_log_energy(signal):
            energy = AudioEnergyValidator._signal_energy(signal)
            # log10 is undefined for 0: use a very low floor instead
            if energy <= 0:
                return -200
            return 10. * numpy.log10(energy)

    else:
        _formats = {1: 'b', 2: 'h', 4: 'i'}

        @staticmethod
        def _convert(signal, sample_width):
            return array("d", array(AudioEnergyValidator._formats[sample_width], signal))

        @staticmethod
        def _signal_energy(signal):
            # An empty frame carries no energy; without this guard the
            # division below raises ZeroDivisionError.
            if len(signal) == 0:
                return 0.
            energy = 0.
            for a in signal:
                energy += a * a
            return energy / len(signal)

        @staticmethod
        def _signal_log_energy(signal):
            energy = AudioEnergyValidator._signal_energy(signal)
            # log10 is undefined for 0: use a very low floor instead
            if energy <= 0:
                return -200
            return 10. * math.log10(energy)

    def __init__(self, sample_width, energy_threshold=45):
        self.sample_width = sample_width
        self._energy_threshold = energy_threshold

    def is_valid(self, data):
        """
        Check if data is valid. Audio data will be converted into an array (of
        signed values) of which the log energy is computed. Log energy is computed
        as follows:

        .. code:: python

            arr = AudioEnergyValidator._convert(signal, sample_width)
            energy = float(numpy.dot(arr, arr)) / len(arr)
            log_energy = 10. * numpy.log10(energy)

        A frame whose energy is 0 (including an empty frame) gets a log energy
        of -200.

        :Parameters:

        `data` : either a *string* or a *Bytes* buffer
            `data` is converted into a numerical array using the `sample_width`
            given in the constructor.

        :Returns:

        True if `log_energy` >= `energy_threshold`, False otherwise.
        """

        signal = AudioEnergyValidator._convert(data, self.sample_width)
        return AudioEnergyValidator._signal_log_energy(signal) >= self._energy_threshold

    def get_energy_threshold(self):
        return self._energy_threshold

    def set_energy_threshold(self, threshold):
        self._energy_threshold = threshold