# bazarr/libs/auditok/core.py
"""
.. autosummary::
:toctree: generated/
load
split
AudioRegion
StreamTokenizer
"""
import os
import math
from .util import AudioReader, DataValidator, AudioEnergyValidator
from .io import check_audio_data, to_file, player_for, get_audio_source
from .exceptions import TooSamllBlockDuration  # sic: class name is misspelled in auditok's exceptions module
try:
from . import signal_numpy as signal
except ImportError:
from . import signal
__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
DEFAULT_ANALYSIS_WINDOW = 0.05
DEFAULT_ENERGY_THRESHOLD = 50
_EPSILON = 1e-10
def load(input, skip=0, max_read=None, **kwargs):
"""Load audio data from a source and return it as an :class:`AudioRegion`.
Parameters
----------
input : None, str, bytes, AudioSource
source to read audio data from. If `str`, it should be a path to a
valid audio file. If `bytes`, it is used as raw audio data. If it is
"-", raw data will be read from stdin. If None, read audio data from
the microphone using PyAudio. If `input` is of type `bytes` or is a
path to a raw audio file, then the `sampling_rate`, `sample_width` and
`channels` parameters (or their aliases) are required. If it's an
:class:`AudioSource` object, it's used directly to read data.
skip : float, default: 0
amount, in seconds, of audio data to skip from source. If read from
a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
max_read : float, default: None
amount, in seconds, of audio data to read from source. If read from
microphone, `max_read` should not be None, otherwise a `ValueError` is
raised.
audio_format, fmt : str
type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
be used if `input` is a string path to an audio file. If not given,
audio type will be guessed from file name extension or from file
header.
sampling_rate, sr : int
sampling rate of audio data. Required if `input` is a raw audio file,
a `bytes` object or None (i.e., read from microphone).
sample_width, sw : int
number of bytes used to encode one audio sample, typically 1, 2 or 4.
Required for raw data, see `sampling_rate`.
channels, ch : int
number of channels of audio data. Required for raw data, see
`sampling_rate`.
large_file : bool, default: False
If True, AND if `input` is a path to a *wav* or a *raw* audio file
(and **only** these two formats), then the audio file is not fully loaded
into memory to create the region (only the portion of data needed to
create the region is loaded). Set to True if `max_read` is
significantly smaller than the size of a large audio file that
shouldn't be entirely loaded into memory.
Returns
-------
region: AudioRegion
Raises
------
ValueError
raised if `input` is None (i.e., read data from microphone) and `skip`
!= 0, or if `input` is None and `max_read` is None (meaning that when
reading from the microphone, no data should be skipped, and the maximum
amount of data to read should be explicitly provided).
"""
return AudioRegion.load(input, skip, max_read, **kwargs)
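# Usage sketch (illustrative, not part of the library; file names and
# parameter values below are hypothetical):
#
#     import auditok
#     region = auditok.load("audio.wav")  # format guessed from extension
#     # raw data requires audio parameters (or their aliases sr, sw, ch)
#     raw = auditok.load("audio.dat", audio_format="raw",
#                        sampling_rate=16000, sample_width=2, channels=1)
#     # read 5 seconds from the microphone; max_read is mandatory here
#     mic = auditok.load(None, max_read=5, sr=16000, sw=2, ch=1)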
def split(
input,
min_dur=0.2,
max_dur=5,
max_silence=0.3,
drop_trailing_silence=False,
strict_min_dur=False,
**kwargs
):
"""
Split audio data and return a generator of AudioRegions
Parameters
----------
input : str, bytes, AudioSource, AudioReader, AudioRegion or None
input audio data. If str, it should be a path to an existing audio file.
"-" is interpreted as standard input. If bytes, input is considered as
raw audio data. If None, read audio from microphone.
Every object that is not an `AudioReader` will be transformed into an
`AudioReader` before processing. If it is a `str` that refers to a raw
audio file, `bytes` or None, audio parameters should be provided using
kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or their
aliases).
If `input` is str then audio format will be guessed from file extension.
`audio_format` (alias `fmt`) kwarg can also be given to specify audio
format explicitly. If none of these options is available, rely on
backend (currently only pydub is supported) to load data.
min_dur : float, default: 0.2
minimum duration in seconds of a detected audio event. Using large
values for `min_dur` can cause very short audio events (e.g., very
short 1-word utterances like 'yes' or 'no') to be missed. Using very
small values might result in a high number of short, useless audio
events.
max_dur : float, default: 5
maximum duration in seconds of a detected audio event. If an audio event
lasts more than `max_dur` it will be truncated. If the continuation of a
truncated audio event is shorter than `min_dur` then this continuation
is accepted as a valid audio event if `strict_min_dur` is False.
Otherwise it is rejected.
max_silence : float, default: 0.3
maximum duration of continuous silence within an audio event. There
might be many silent gaps of this duration within one audio event. If
the continuous silence happens at the end of the event, then it's kept as
part of the event if `drop_trailing_silence` is False (default).
drop_trailing_silence : bool, default: False
Whether to remove trailing silence from detected events. To avoid abrupt
cuts in speech, trailing silence should be kept, therefore this
parameter should be False.
strict_min_dur : bool, default: False
strict minimum duration. Do not accept an audio event if it is shorter
than `min_dur` even if it is contiguous to the latest valid event. This
happens if the latest detected event had reached `max_dur`.
Other Parameters
----------------
analysis_window, aw : float, default: 0.05 (50 ms)
duration of analysis window in seconds. A value between 0.01 (10 ms) and
0.1 (100 ms) should be good for most use-cases.
audio_format, fmt : str
type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
used if `input` is a string path to an audio file. If not given, audio
type will be guessed from file name extension or from file header.
sampling_rate, sr : int
sampling rate of audio data. Required if `input` is a raw audio file, is
a bytes object or None (i.e., read from microphone).
sample_width, sw : int
number of bytes used to encode one audio sample, typically 1, 2 or 4.
Required for raw data, see `sampling_rate`.
channels, ch : int
number of channels of audio data. Required for raw data, see
`sampling_rate`.
use_channel, uc : {None, "mix"} or int
which channel to use for split if `input` has multiple audio channels.
Regardless of which channel is used for splitting, returned audio events
contain data from *all* channels, just as `input`.
The following values are accepted:
- None (alias "any"): accept audio activity from any channel, even if
other channels are silent. This is the default behavior.
- "mix" ("avg" or "average"): mix down all channels (i.e. compute
average channel) and split the resulting channel.
- int (>= 0 and < `channels`): use the channel specified by this
integer id for split.
large_file : bool, default: False
If True, AND if `input` is a path to a *wav* or a *raw* audio file
(and only these two formats), then audio data is lazily loaded into
memory (i.e., one analysis window at a time). Otherwise the whole file is loaded
to memory before split. Set to True if the size of the file is larger
than available memory.
max_read, mr : float, default: None, read until end of stream
maximum data to read from source in seconds.
validator, val : callable, DataValidator
custom data validator. If `None` (default), an `AudioEnergyValidator` is
used with the given energy threshold. Can be a callable or an instance
of `DataValidator` that implements `is_valid`. In either case, it'll be
called with a window of audio data as the first parameter.
energy_threshold, eth : float, default: 50
energy threshold for audio activity detection. Audio regions that have
enough windows with a signal energy equal to or above this threshold
are considered valid audio events. Here we refer to this quantity as
the energy of the signal, but to be more accurate it is the log
energy, computed as `20 * log10(sqrt(dot(x, x) / len(x)))` (see
:class:`AudioEnergyValidator` and
:func:`calculate_energy_single_channel`). If `validator` is given, this
argument is ignored.
Yields
------
AudioRegion
a generator of detected :class:`AudioRegion` s.
"""
if min_dur <= 0:
raise ValueError("'min_dur' ({}) must be > 0".format(min_dur))
if max_dur <= 0:
raise ValueError("'max_dur' ({}) must be > 0".format(max_dur))
if max_silence < 0:
raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence))
if isinstance(input, AudioReader):
source = input
analysis_window = source.block_dur
else:
analysis_window = kwargs.get(
"analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
)
if analysis_window <= 0:
raise ValueError(
"'analysis_window' ({}) must be > 0".format(analysis_window)
)
params = kwargs.copy()
params["max_read"] = params.get("max_read", params.get("mr"))
params["audio_format"] = params.get("audio_format", params.get("fmt"))
if isinstance(input, AudioRegion):
params["sampling_rate"] = input.sr
params["sample_width"] = input.sw
params["channels"] = input.ch
input = bytes(input)
try:
source = AudioReader(input, block_dur=analysis_window, **params)
except TooSamllBlockDuration as exc:
err_msg = "Too small 'analysis_windows' ({0}) for sampling rate "
err_msg += "({1}). Analysis windows should at least be 1/{1} to "
err_msg += "cover one single data sample"
raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate))
validator = kwargs.get("validator", kwargs.get("val"))
if validator is None:
energy_threshold = kwargs.get(
"energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
)
use_channel = kwargs.get("use_channel", kwargs.get("uc"))
validator = AudioEnergyValidator(
energy_threshold, source.sw, source.ch, use_channel=use_channel
)
mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
if strict_min_dur:
mode |= StreamTokenizer.STRICT_MIN_LENGTH
min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
max_length = _duration_to_nb_windows(
max_dur, analysis_window, math.floor, _EPSILON
)
max_continuous_silence = _duration_to_nb_windows(
max_silence, analysis_window, math.floor, _EPSILON
)
err_msg = "({0} sec.) results in {1} analysis window(s) "
err_msg += "({1} == {6}({0} / {2})) which is {5} the number "
err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
if min_length > max_length:
err_msg = "'min_dur' " + err_msg
raise ValueError(
err_msg.format(
min_dur,
min_length,
analysis_window,
max_length,
max_dur,
"higher than",
"ceil",
)
)
if max_continuous_silence >= max_length:
err_msg = "'max_silence' " + err_msg
raise ValueError(
err_msg.format(
max_silence,
max_continuous_silence,
analysis_window,
max_length,
max_dur,
"higher or equal to",
"floor",
)
)
tokenizer = StreamTokenizer(
validator, min_length, max_length, max_continuous_silence, mode=mode
)
source.open()
token_gen = tokenizer.tokenize(source, generator=True)
region_gen = (
_make_audio_region(
token[0],
token[1],
source.block_dur,
source.sr,
source.sw,
source.ch,
)
for token in token_gen
)
return region_gen
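# Usage sketch (illustrative; "audio.wav" and the parameter values are
# hypothetical): iterate over detected events and save each one using
# the start/end metadata set by _make_audio_region below.
#
#     import auditok
#     for region in auditok.split("audio.wav", min_dur=0.3, max_dur=4,
#                                 max_silence=0.2, energy_threshold=55):
#         region.save("event_{meta.start:.3f}-{meta.end:.3f}.wav")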
def _duration_to_nb_windows(
duration, analysis_window, round_fn=round, epsilon=0
):
"""
Convert a given duration into a positive integer number of analysis
windows. If `duration / analysis_window` is not an integer, the result
is rounded by `round_fn` (e.g., with `math.ceil`, up to the next
integer). If `duration == 0`, returns `0`. With `round_fn=math.ceil`,
if `0 < duration < analysis_window`, returns 1.
`duration` and `analysis_window` can be in seconds or milliseconds but
must be in the same unit.
Parameters
----------
duration : float
a given duration in seconds or ms.
analysis_window: float
size of analysis window, in the same unit as `duration`.
round_fn : callable
function called to round the result. Default: `round`.
epsilon : float
small value to add to the division result before rounding.
E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
`round_fn=math.floor` returns `2` instead of `3`. Adding a small value
to `0.3 / 0.1` avoids this error.
Returns
-------
nb_windows : int
number of analysis windows for `duration`. With `round_fn=math.ceil`,
this is the minimum number of windows needed to cover `duration`,
i.e., `analysis_window * nb_windows >= duration`.
"""
if duration < 0 or analysis_window <= 0:
err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
raise ValueError(err_msg.format(duration, analysis_window))
if duration == 0:
return 0
return int(round_fn(duration / analysis_window + epsilon))
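# Worked example (illustrative): 0.3 / 0.1 evaluates to
# 2.9999999999999996 in floating point, so math.floor alone would
# return 2 instead of the intended 3; adding _EPSILON compensates:
#
#     >>> _duration_to_nb_windows(0.3, 0.1, math.floor, _EPSILON)
#     3
#     >>> _duration_to_nb_windows(0.25, 0.1, math.ceil)
#     3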
def _make_audio_region(
data_frames,
start_frame,
frame_duration,
sampling_rate,
sample_width,
channels,
):
"""
Helper function to create an `AudioRegion` from parameters returned by
tokenization object. It takes care of setting up region `start` and `end`
in metadata.
Parameters
----------
data_frames : list of bytes
analysis windows (data frames) that make up the region
start_frame : int
index of the first analysis window
frame_duration : float
duration of an analysis window in seconds
sampling_rate : int
sampling rate of audio data
sample_width : int
number of bytes of one audio sample
channels : int
number of channels of audio data
Returns
-------
audio_region : AudioRegion
AudioRegion whose start time is calculated as:
`start_frame * frame_duration`
"""
start = start_frame * frame_duration
data = b"".join(data_frames)
duration = len(data) / (sampling_rate * sample_width * channels)
meta = {"start": start, "end": start + duration}
return AudioRegion(data, sampling_rate, sample_width, channels, meta)
def _read_chunks_online(max_read, **kwargs):
"""
Helper function to read audio data from an online blocking source
(i.e., microphone). Used to build an `AudioRegion` and can intercept
KeyboardInterrupt so that reading stops as soon as this exception is
raised. Makes building `AudioRegion`s in [i]python sessions and Jupyter
notebooks more user friendly.
Parameters
----------
max_read : float
maximum amount of data to read in seconds.
kwargs :
audio parameters (sampling_rate, sample_width and channels).
See also
--------
`AudioRegion.build`
"""
reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
reader.open()
data = []
try:
while True:
frame = reader.read()
if frame is None:
break
data.append(frame)
except KeyboardInterrupt:
# Stop data acquisition from microphone when pressing
# Ctrl+C on a [i]python session or a notebook
pass
reader.close()
return (
b"".join(data),
reader.sampling_rate,
reader.sample_width,
reader.channels,
)
def _read_offline(input, skip=0, max_read=None, **kwargs):
"""
Helper function to read audio data from an offline source (i.e., a
file). Used to build `AudioRegion`s.
Parameters
----------
input : str, bytes
path to audio file (if str), or a bytes object representing raw audio
data.
skip : float, default 0
amount of data, in seconds, to skip from the beginning of the audio source.
max_read : float, default: None
maximum amount of audio data to read. Default: None, means read until
end of stream.
kwargs :
audio parameters (sampling_rate, sample_width and channels).
See also
--------
`AudioRegion.build`
"""
audio_source = get_audio_source(input, **kwargs)
audio_source.open()
if skip is not None and skip > 0:
skip_samples = round(skip * audio_source.sampling_rate)
audio_source.read(skip_samples)
if max_read is not None:
if max_read < 0:
max_read = None
else:
max_read = round(max_read * audio_source.sampling_rate)
data = audio_source.read(max_read)
audio_source.close()
return (
data,
audio_source.sampling_rate,
audio_source.sample_width,
audio_source.channels,
)
def _check_convert_index(index, types, err_msg):
if not isinstance(index, slice) or index.step is not None:
raise TypeError(err_msg)
start = index.start if index.start is not None else 0
stop = index.stop
for index in (start, stop):
if index is not None and not isinstance(index, types):
raise TypeError(err_msg)
return start, stop
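# Behavior sketch (illustrative values): only step-less slices whose
# bounds are of the accepted types pass validation.
#
#     _check_convert_index(slice(2, 5), (int,), "err")     # -> (2, 5)
#     _check_convert_index(slice(None, 5), (int,), "err")  # -> (0, 5)
#     _check_convert_index(slice(2, 5, 1), (int,), "err")  # TypeError: step given
#     _check_convert_index((2, 5), (int,), "err")          # TypeError: not a slice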
class _SecondsView:
"""A class to create a view of `AudioRegion` that can be sliced using
indices in seconds.
"""
def __init__(self, region):
self._region = region
def __getitem__(self, index):
err_msg = "Slicing AudioRegion by seconds requires indices of type "
err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
sr = self._region.sampling_rate
start_sample = int(start_s * sr)
stop_sample = None if stop_s is None else round(stop_s * sr)
return self._region[start_sample:stop_sample]
@property
def len(self):
"""
Return region duration in seconds.
"""
return self._region.duration
class _MillisView(_SecondsView):
"""A class to create a view of `AudioRegion` that can be sliced using
indices in milliseconds.
"""
def __getitem__(self, index):
err_msg = (
"Slicing AudioRegion by milliseconds requires indices of type "
)
err_msg += "'int' without a step (e.g. region.sec[500:1500])"
start_ms, stop_ms = _check_convert_index(index, (int), err_msg)
start_sec = start_ms / 1000
stop_sec = None if stop_ms is None else stop_ms / 1000
index = slice(start_sec, stop_sec)
return super(_MillisView, self).__getitem__(index)
def __len__(self):
"""
Return region duration in milliseconds.
"""
return round(self._region.duration * 1000)
@property
def len(self):
"""
Return region duration in milliseconds.
"""
return len(self)
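# Usage sketch (illustrative): the same hypothetical 16 kHz region
# sliced through its three views.
#
#     sub = region.sec[1.5:3.0]   # seconds; int or float indices
#     sub = region.ms[1500:3000]  # milliseconds; int indices only
#     sub = region[24000:48000]   # raw samples, via AudioRegion.__getitem__
#     region.sec.len              # duration in seconds
#     len(region.ms)              # duration in milliseconds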
class _AudioRegionMetadata(dict):
"""A class to store `AudioRegion`'s metadata."""
def __getattr__(self, name):
if name in self:
return self[name]
else:
err_msg = "AudioRegion metadata has no entry '{}'"
raise AttributeError(err_msg.format(name))
def __setattr__(self, name, value):
self[name] = value
def __str__(self):
return "\n".join("{}: {}".format(k, v) for k, v in self.items())
def __repr__(self):
return str(self)
class AudioRegion(object):
"""
AudioRegion encapsulates raw audio data and provides an interface to
perform simple operations on it. Use `AudioRegion.load` to build an
`AudioRegion` from different types of objects.
Parameters
----------
data : bytes
raw audio data as a bytes object
sampling_rate : int
sampling rate of audio data
sample_width : int
number of bytes of one audio sample
channels : int
number of channels of audio data
meta : dict, default: None
any collection of <key:value> elements used to build metadata for
this `AudioRegion`. Meta data can be accessed via `region.meta.key`
if `key` is a valid python attribute name, or via `region.meta[key]`
if not. Note that the :func:`split` function (or the
:meth:`AudioRegion.split` method) returns `AudioRegion`s with ``start``
and ``end`` meta values that indicate the location in seconds of the
region in the original audio data.
See also
--------
AudioRegion.load
"""
def __init__(self, data, sampling_rate, sample_width, channels, meta=None):
check_audio_data(data, sample_width, channels)
self._data = data
self._sampling_rate = sampling_rate
self._sample_width = sample_width
self._channels = channels
self._samples = None
self.splitp = self.split_and_plot
if meta is not None:
self._meta = _AudioRegionMetadata(meta)
else:
self._meta = None
self._seconds_view = _SecondsView(self)
self.sec = self.seconds
self.s = self.seconds
self._millis_view = _MillisView(self)
self.ms = self.millis
@property
def meta(self):
return self._meta
@meta.setter
def meta(self, new_meta):
"""Meta data of audio region."""
self._meta = _AudioRegionMetadata(new_meta)
@classmethod
def load(cls, input, skip=0, max_read=None, **kwargs):
"""
Create an `AudioRegion` by loading data from `input`. See :func:`load`
for parameters description.
Returns
-------
region: AudioRegion
Raises
------
ValueError
raised if `input` is None and `skip` != 0 or `max_read` is None.
"""
if input is None:
if skip > 0:
raise ValueError(
"'skip' should be 0 when reading from microphone"
)
if max_read is None or max_read < 0:
raise ValueError(
"'max_read' should not be None when reading from "
"microphone"
)
data, sampling_rate, sample_width, channels = _read_chunks_online(
max_read, **kwargs
)
else:
data, sampling_rate, sample_width, channels = _read_offline(
input, skip=skip, max_read=max_read, **kwargs
)
return cls(data, sampling_rate, sample_width, channels)
@property
def seconds(self):
"""
A view to slice audio region by seconds (using ``region.seconds[start:end]``).
"""
return self._seconds_view
@property
def millis(self):
"""A view to slice audio region by milliseconds (using ``region.millis[start:end]``)."""
return self._millis_view
@property
def duration(self):
"""
Returns region duration in seconds.
"""
return len(self._data) / (
self.sampling_rate * self.sample_width * self.channels
)
@property
def sampling_rate(self):
"""Samling rate of audio data."""
return self._sampling_rate
@property
def sr(self):
"""Samling rate of audio data, alias for `sampling_rate`."""
return self._sampling_rate
@property
def sample_width(self):
"""Number of bytes per sample, one channel considered."""
return self._sample_width
@property
def sw(self):
"""Number of bytes per sample, alias for `sampling_rate`."""
return self._sample_width
@property
def channels(self):
"""Number of channels of audio data."""
return self._channels
@property
def ch(self):
"""Number of channels of audio data, alias for `channels`."""
return self._channels
def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
"""
Play audio region.
Parameters
----------
progress_bar : bool, default: False
whether to use a progress bar while playing audio. Default: False.
`progress_bar` requires `tqdm`; if it is not installed, no progress
bar will be shown.
player : AudioPlayer, default: None
audio player to use. If None (default), use `player_for()`
to get a new audio player.
progress_bar_kwargs : kwargs
keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
use `leave=False` to clean up the screen when play finishes).
"""
if player is None:
player = player_for(self)
player.play(
self._data, progress_bar=progress_bar, **progress_bar_kwargs
)
def save(self, file, audio_format=None, exists_ok=True, **audio_parameters):
"""
Save audio region to file.
Parameters
----------
file : str
path to output audio file. May contain a `{duration}` placeholder
as well as any placeholder that this region's metadata might
contain (e.g., regions returned by `split` contain metadata with
`start` and `end` attributes that can be used to build the output file
name, as `{meta.start}` and `{meta.end}`). See the examples below for
placeholders with formatting.
audio_format : str, default: None
format used to save audio data. If None (default), format is guessed
from file name's extension. If file name has no extension, audio
data is saved as a raw (headerless) audio file.
exists_ok : bool, default: True
If True, overwrite `file` if a file with the same name exists.
If False, raise an `IOError` if `file` exists.
audio_parameters: dict
any keyword arguments to be passed to audio saving backend.
Returns
-------
file : str
name of output file with replaced placeholders.
Raises
------
FileExistsError
if `file` exists and `exists_ok` is False.
Examples
--------
>>> region = AudioRegion(b'\\0' * 2 * 24000,
...                      sampling_rate=16000,
...                      sample_width=2,
...                      channels=1)
>>> region.meta.start = 2.25
>>> region.meta.end = 2.25 + region.duration
>>> region.save('audio_{meta.start}-{meta.end}.wav')
'audio_2.25-3.75.wav'
>>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav')
'region_2.250_1.500.wav'
"""
if isinstance(file, str):
file = file.format(duration=self.duration, meta=self.meta)
if not exists_ok and os.path.exists(file):
raise FileExistsError("file '{file}' exists".format(file=file))
to_file(
self._data,
file,
audio_format,
sr=self.sr,
sw=self.sw,
ch=self.ch,
audio_parameters=audio_parameters,
)
return file
def split(
self,
min_dur=0.2,
max_dur=5,
max_silence=0.3,
drop_trailing_silence=False,
strict_min_dur=False,
**kwargs
):
"""Split audio region. See :func:`auditok.split()` for a comprehensive
description of split parameters.
See Also :meth:`AudioRegio.split_and_plot`.
"""
if kwargs.get("max_read", kwargs.get("mr")) is not None:
warn_msg = "'max_read' (or 'mr') should not be used with "
warn_msg += "AudioRegion.split_and_plot(). You should rather "
warn_msg += "slice audio region before calling this method"
raise RuntimeWarning(warn_msg)
return split(
self,
min_dur=min_dur,
max_dur=max_dur,
max_silence=max_silence,
drop_trailing_silence=drop_trailing_silence,
strict_min_dur=strict_min_dur,
**kwargs
)
def plot(
self,
scale_signal=True,
show=True,
figsize=None,
save_as=None,
dpi=120,
theme="auditok",
):
"""Plot audio region, one sub-plot for each channel.
Parameters
----------
scale_signal : bool, default: True
if True, scale the signal by subtracting its mean and dividing by its
standard deviation before plotting.
show : bool
whether to show plotted signal right after the call.
figsize : tuple, default: None
width and height of the figure to pass to `matplotlib`.
save_as : str, default: None
if provided, also save plot to file.
dpi : int, default: 120
plot dpi to pass to `matplotlib`.
theme : str or dict, default: "auditok"
plot theme to use. Currently only the "auditok" theme is implemented. To
provide your own theme, see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
"""
try:
from auditok.plotting import plot
plot(
self,
scale_signal=scale_signal,
show=show,
figsize=figsize,
save_as=save_as,
dpi=dpi,
theme=theme,
)
except ImportError:
raise RuntimeWarning("Plotting requires matplotlib")
def split_and_plot(
self,
min_dur=0.2,
max_dur=5,
max_silence=0.3,
drop_trailing_silence=False,
strict_min_dur=False,
scale_signal=True,
show=True,
figsize=None,
save_as=None,
dpi=120,
theme="auditok",
**kwargs
):
"""Split region and plot signal and detections. Alias: :meth:`splitp`.
See :func:`auditok.split()` for a comprehensive description of split
parameters. Also see :meth:`plot` for plot parameters.
"""
try:
from auditok.plotting import plot
regions = self.split(
min_dur=min_dur,
max_dur=max_dur,
max_silence=max_silence,
drop_trailing_silence=drop_trailing_silence,
strict_min_dur=strict_min_dur,
**kwargs
)
regions = list(regions)
detections = ((reg.meta.start, reg.meta.end) for reg in regions)
eth = kwargs.get(
"energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
)
plot(
self,
scale_signal=scale_signal,
detections=detections,
energy_threshold=eth,
show=show,
figsize=figsize,
save_as=save_as,
dpi=dpi,
theme=theme,
)
return regions
except ImportError:
raise RuntimeWarning("Plotting requires matplotlib")
def __array__(self):
return self.samples
@property
def samples(self):
"""Audio region as arrays of samples, one array per channel."""
if self._samples is None:
self._samples = signal.to_array(
self._data, self.sample_width, self.channels
)
return self._samples
def __len__(self):
"""
Return region length in number of samples.
"""
return len(self._data) // (self.sample_width * self.channels)
@property
def len(self):
"""
Return region length in number of samples.
"""
return len(self)
def __bytes__(self):
return self._data
def __str__(self):
return (
"AudioRegion(duration={:.3f}, "
"sampling_rate={}, sample_width={}, channels={})".format(
self.duration, self.sr, self.sw, self.ch
)
)
def __repr__(self):
return str(self)
def __add__(self, other):
"""
Concatenate this region and `other` and return a new region.
Both regions must have the same sampling rate, sample width
and number of channels. If not, a `ValueError` is raised.
"""
if not isinstance(other, AudioRegion):
raise TypeError(
"Can only concatenate AudioRegion, "
'not "{}"'.format(type(other))
)
if other.sr != self.sr:
raise ValueError(
"Can only concatenate AudioRegions of the same "
"sampling rate ({} != {})".format(self.sr, other.sr)
)
if other.sw != self.sw:
raise ValueError(
"Can only concatenate AudioRegions of the same "
"sample width ({} != {})".format(self.sw, other.sw)
)
if other.ch != self.ch:
raise ValueError(
"Can only concatenate AudioRegions of the same "
"number of channels ({} != {})".format(self.ch, other.ch)
)
data = self._data + other._data
return AudioRegion(data, self.sr, self.sw, self.ch)
def __radd__(self, other):
"""
Concatenates `other` and this region. `other` should be an
`AudioRegion` with the same audio parameters as this region
but can exceptionally be `0` to make it possible to concatenate
many regions with `sum`.
"""
if other == 0:
return self
return other.add(self)
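# Usage sketch: the special-casing of 0 above is what makes the
# built-in sum() work over regions (sum starts its accumulator at 0);
# region1..region3 are hypothetical regions with identical parameters.
#
#     total = sum([region1, region2, region3])
#     # equivalent to region1 + region2 + region3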
def __mul__(self, n):
if not isinstance(n, int):
err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
raise TypeError(err_msg.format(type(n)))
data = self._data * n
return AudioRegion(data, self.sr, self.sw, self.ch)
def __rmul__(self, n):
return self * n
def __truediv__(self, n):
if not isinstance(n, int) or n <= 0:
raise TypeError("AudioRegion can only be divided by a positive int")
samples_per_sub_region, rest = divmod(len(self), n)
onset = 0
sub_regions = []
while onset < len(self):
offset = 0
if rest > 0:
offset = 1
rest -= 1
offset += onset + samples_per_sub_region
sub_regions.append(self[onset:offset])
onset = offset
return sub_regions
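# Usage sketch (illustrative): dividing a region yields n sub-regions;
# remainder samples are spread over the first sub-regions, so lengths
# differ by at most one sample.
#
#     chunks = region / 4  # list of 4 AudioRegions
#     assert sum(len(c) for c in chunks) == len(region)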
def __eq__(self, other):
if other is self:
return True
if not isinstance(other, AudioRegion):
return False
return (
(self._data == other._data)
and (self.sr == other.sr)
and (self.sw == other.sw)
and (self.ch == other.ch)
)
def __getitem__(self, index):
err_msg = "Slicing AudioRegion by samples requires indices of type "
err_msg += "'int' without a step (e.g. region.sec[1600:3200])"
start_sample, stop_sample = _check_convert_index(index, (int), err_msg)
bytes_per_sample = self.sample_width * self.channels
len_samples = len(self._data) // bytes_per_sample
if start_sample < 0:
start_sample = max(start_sample + len_samples, 0)
onset = start_sample * bytes_per_sample
if stop_sample is not None:
if stop_sample < 0:
stop_sample = max(stop_sample + len_samples, 0)
offset = stop_sample * bytes_per_sample
else:
offset = None
data = self._data[onset:offset]
return AudioRegion(data, self.sr, self.sw, self.ch)
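# Usage sketch (illustrative, for a hypothetical 16 kHz region):
# negative indices count from the end, as with ordinary sequences.
#
#     first_second = region[:16000]
#     last_second = region[-16000:]
#     middle = region[8000:24000]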
class StreamTokenizer:
"""
Class for stream tokenizers. It implements a 4-state automaton scheme
to extract sub-sequences of interest on the fly.
Parameters
----------
validator : callable, DataValidator (must implement `is_valid`)
called with each data frame read from source. Should take one positional
argument and return True or False for valid and invalid frames
respectively.
min_length : int
Minimum number of frames of a valid token. This includes all
tolerated non valid frames within the token.
max_length : int
Maximum number of frames of a valid token. This includes all
tolerated non valid frames within the token.
max_continuous_silence : int
Maximum number of consecutive non-valid frames within a token.
Note that, within a valid token, there may be many tolerated
*silent* regions that each contain a number of non-valid frames up
to `max_continuous_silence`.
init_min : int
Minimum number of consecutive valid frames that must be
**initially** gathered before any sequence of non valid frames can
be tolerated. This option is not always needed, it can be used to
drop non-valid tokens as early as possible. **Default = 0** means
that the option is by default ineffective.
init_max_silence : int
Maximum number of tolerated consecutive non-valid frames if the
number of already gathered valid frames has not yet reached
`init_min`. This argument is normally used if `init_min` is used.
**Default = 0**, by default this argument is not taken into
consideration.
mode : int
mode can be one of the following:
1. `StreamTokenizer.NORMAL`: do not drop trailing silence, and
accept a token shorter than `min_length` if it is the continuation
of the latest delivered token.
2. `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
because `max_length` is reached, and token `i+1` is immediately
adjacent to token `i` (i.e. token `i` ends at frame `k` and token
`i+1` starts at frame `k+1`), then accept token `i+1` only if it has
a size of at least `min_length`. The default behavior is to accept
token `i+1` even if it is shorter than `min_length` (provided that
the above conditions are fulfilled of course).
3. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
non-valid frames from a token to be delivered if and only if it
is not **truncated**. This can be a bit tricky. A token is actually
delivered if:
- `max_continuous_silence` is reached.
- Its length reaches `max_length`. This is referred to as a
**truncated** token.
In the current implementation, a `StreamTokenizer`'s decision is only
based on already seen data and on incoming data. Thus, if a token is
truncated at a non-valid but tolerated frame (`max_length` is reached
but `max_continuous_silence` not yet), any trailing silence will be
kept because it can potentially be part of a valid token (if
`max_length` were bigger). But if `max_continuous_silence` is reached
before `max_length`, the delivered token will not be considered as
truncated but as the result of a *normal* end of detection (i.e., no
more valid data). In that case the trailing silence can be removed if
you use the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
4. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
use both options. That means: first remove trailing silence, then
check if the token still has a length of at least `min_length`.
Examples
--------
In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
accepted although it is shorter than `min_length` (3), because it
immediately follows the latest delivered token:
>>> from auditok.core import StreamTokenizer
>>> from auditok.util import StringDataSource, DataValidator
>>> class UpperCaseChecker(DataValidator):
...     def is_valid(self, frame):
...         return frame.isupper()
>>> dsource = StringDataSource("aaaAAAABBbbb")
>>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
...                             min_length=3,
...                             max_length=4,
...                             max_continuous_silence=0)
>>> tokenizer.tokenize(dsource)
[(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
The following tokenizer will however reject the 'BB' token:
>>> dsource = StringDataSource("aaaAAAABBbbb")
>>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
...                             min_length=3, max_length=4,
...                             max_continuous_silence=0,
...                             mode=StreamTokenizer.STRICT_MIN_LENGTH)
>>> tokenizer.tokenize(dsource)
[(['A', 'A', 'A', 'A'], 3, 6)]
>>> tokenizer = StreamTokenizer(
...     validator=UpperCaseChecker(),
...     min_length=3,
...     max_length=6,
...     max_continuous_silence=3,
...     mode=StreamTokenizer.DROP_TRAILING_SILENCE
... )
>>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
>>> tokenizer.tokenize(dsource)
[(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
The first token is delivered with its trailing silence because it is
truncated, while the second one has its trailing frames removed.
Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
.. code:: python
[
(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
(['B', 'B', 'b', 'b', 'b'], 9, 13)
]
"""
SILENCE = 0
POSSIBLE_SILENCE = 1
POSSIBLE_NOISE = 2
NOISE = 3
NORMAL = 0
STRICT_MIN_LENGTH = 2
DROP_TRAILING_SILENCE = 4
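# STRICT_MIN_LENGTH and DROP_TRAILING_SILENCE are bit flags and can be
# combined, e.g.:
#     mode = StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE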
def __init__(
self,
validator,
min_length,
max_length,
max_continuous_silence,
init_min=0,
init_max_silence=0,
mode=0,
):
if callable(validator):
self._is_valid = validator
elif isinstance(validator, DataValidator):
self._is_valid = validator.is_valid
else:
raise TypeError(
"'validator' must be a callable or an instance of "
"DataValidator"
)
if max_length <= 0:
raise ValueError(
"'max_length' must be > 0 (value={0})".format(max_length)
)
if min_length <= 0 or min_length > max_length:
err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
raise ValueError(err_msg.format(min_length))
if max_continuous_silence >= max_length:
err_msg = "'max_continuous_silence' must be < 'max_length' "
err_msg += "(value={0})"
raise ValueError(err_msg.format(max_continuous_silence))
if init_min >= max_length:
raise ValueError(
"'init_min' must be < 'max_length' (value={0})".format(
init_min
)
)
self.validator = validator
self.min_length = min_length
self.max_length = max_length
self.max_continuous_silence = max_continuous_silence
self.init_min = init_min
self.init_max_silent = init_max_silence  # note: attribute name differs from the parameter
self._set_mode(mode)
self._deliver = None
self._tokens = None
self._state = None
self._data = None
self._contiguous_token = False
self._init_count = 0
self._silence_length = 0
self._start_frame = 0
self._current_frame = 0
def _set_mode(self, mode):
strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
if mode not in [
StreamTokenizer.NORMAL,
StreamTokenizer.STRICT_MIN_LENGTH,
StreamTokenizer.DROP_TRAILING_SILENCE,
strict_min_and_drop_trailing,
]:
raise ValueError("Wrong value for mode")
self._mode = mode
self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
def _reinitialize(self):
self._contiguous_token = False
self._data = []
self._tokens = []
self._state = self.SILENCE
self._current_frame = -1
self._deliver = self._append_token
def tokenize(self, data_source, callback=None, generator=False):
"""
Read data from `data_source`, one frame a time, and process the read
frames in order to detect sequences of frames that make up valid
tokens.
:Parameters:
`data_source` : instance of the :class:`DataSource` class that
implements a `read` method. 'read' should return a slice of
signal, i.e. frame (of whatever type as long as it can be
processed by validator) and None if there is no more signal.
`callback` : an optional 3-argument function.
If a `callback` function is given, it will be called each time
a valid token is found.
:Returns:
A list of tokens if `callback` is None. Each token is tuple with the
following elements:
.. code python
(data, start, end)
where `data` is a list of read frames, `start`: index of the first
frame in the original data and `end` : index of the last frame.
"""
token_gen = self._iter_tokens(data_source)
if callback:
for token in token_gen:
callback(*token)
return
if generator:
return token_gen
return list(token_gen)
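# Usage sketch: the three ways to consume tokenize(), with `tokenizer`
# and `dsource` as in the class docstring example above.
#
#     tokens = tokenizer.tokenize(dsource)               # list of (data, start, end)
#     gen = tokenizer.tokenize(dsource, generator=True)  # lazy generator
#     tokenizer.tokenize(dsource,                        # callback per token, returns None
#                        callback=lambda data, start, end: print(start, end))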
def _iter_tokens(self, data_source):
self._reinitialize()
while True:
frame = data_source.read()
self._current_frame += 1
if frame is None:
token = self._post_process()
if token is not None:
yield token
break
token = self._process(frame)
if token is not None:
yield token
def _process(self, frame): # noqa: C901
frame_is_valid = self._is_valid(frame)
if self._state == self.SILENCE:
if frame_is_valid:
# seems we got a valid frame after a silence
self._init_count = 1
self._silence_length = 0
self._start_frame = self._current_frame
self._data.append(frame)
if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
return self._process_end_of_detection(True)
else:
self._state = self.POSSIBLE_NOISE
elif self._state == self.POSSIBLE_NOISE:
if frame_is_valid:
self._silence_length = 0
self._init_count += 1
self._data.append(frame)
if self._init_count >= self.init_min:
self._state = self.NOISE
if len(self._data) >= self.max_length:
return self._process_end_of_detection(True)
else:
self._silence_length += 1
if (
self._silence_length > self.init_max_silent
or len(self._data) + 1 >= self.max_length
):
# either init_max_silent or max_length is reached
# before init_min valid frames; back to silence
self._data = []
self._state = self.SILENCE
else:
self._data.append(frame)
elif self._state == self.NOISE:
if frame_is_valid:
self._data.append(frame)
if len(self._data) >= self.max_length:
return self._process_end_of_detection(True)
elif self.max_continuous_silence <= 0:
# no silence tolerated: end of detection. The token is delivered
# if it's long enough, or if it's contiguous to the previous
# token and STRICT_MIN_LENGTH is not set
self._state = self.SILENCE
return self._process_end_of_detection()
else:
# this is the first silent frame following a valid one
# and it is tolerated
self._silence_length = 1
self._data.append(frame)
self._state = self.POSSIBLE_SILENCE
if len(self._data) == self.max_length:
return self._process_end_of_detection(True)
# don't reset _silence_length because we still
# need to know the total number of silent frames
elif self._state == self.POSSIBLE_SILENCE:
if frame_is_valid:
self._data.append(frame)
self._silence_length = 0
self._state = self.NOISE
if len(self._data) >= self.max_length:
return self._process_end_of_detection(True)
else:
if self._silence_length >= self.max_continuous_silence:
self._state = self.SILENCE
if self._silence_length < len(self._data):
# deliver gathered frames only if they aren't all silent
return self._process_end_of_detection()
self._data = []
self._silence_length = 0
else:
self._data.append(frame)
self._silence_length += 1
if len(self._data) >= self.max_length:
return self._process_end_of_detection(True)
# don't reset _silence_length because we still
# need to know the total number of silent frames
def _post_process(self):
if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
if len(self._data) > 0 and len(self._data) > self._silence_length:
return self._process_end_of_detection()
def _process_end_of_detection(self, truncated=False):
if (
not truncated
and self._drop_trailing_silence
and self._silence_length > 0
):
# happens when max_continuous_silence is reached or when the
# stream ends with trailing tolerated silent frames
self._data = self._data[0 : -self._silence_length]
if (len(self._data) >= self.min_length) or (
len(self._data) > 0
and not self._strict_min_length
and self._contiguous_token
):
start_frame = self._start_frame
end_frame = self._start_frame + len(self._data) - 1
data = self._data
self._data = []
token = (data, start_frame, end_frame)
if truncated:
# next token (if any) will start at _current_frame + 1
self._start_frame = self._current_frame + 1
# remember that it is contiguous with the just delivered one
self._contiguous_token = True
else:
self._contiguous_token = False
return token
else:
self._contiguous_token = False
self._data = []
def _append_token(self, data, start, end):
self._tokens.append((data, start, end))