bazarr/libs/auditok/cmdline.py

429 lines
14 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
"""
`auditok` -- An Audio Activity Detection tool
`auditok` is a program that can be used for Audio/Acoustic
activity detection. It can read audio data from audio files as well
as from the microphone or standard input.
@author: Mohamed El Amine SEHILI
@copyright: 2015-2021 Mohamed El Amine SEHILI
@license: MIT
@contact: amine.sehili@gmail.com
@deffield updated: 01 Mar 2021
"""
import sys
import os
from argparse import ArgumentParser
import time
import threading
from auditok import __version__, AudioRegion
from .util import AudioDataSource
from .exceptions import EndOfProcessing, AudioEncodingWarning
from .io import player_for
from .cmdline_util import make_logger, make_kwargs, initialize_workers
from . import workers
__all__ = []
__date__ = "2015-11-23"
__updated__ = "2021-03-01"
def main(argv=None):
program_name = os.path.basename(sys.argv[0])
if argv is None:
argv = sys.argv[1:]
try:
parser = ArgumentParser(
prog=program_name, description="An Audio Tokenization tool"
)
parser.add_argument(
"--version", "-v", action="version", version=__version__
)
group = parser.add_argument_group("Input-Output options")
group.add_argument(
dest="input",
help="Input audio or video file. Use '-' for stdin "
"[default: read from microphone using pyaudio]",
metavar="input",
nargs="?",
default=None,
)
group.add_argument(
"-I",
"--input-device-index",
dest="input_device_index",
help="Audio device index [default: %(default)s]. "
"Optional and only effective when using PyAudio",
type=int,
default=None,
metavar="INT",
)
group.add_argument(
"-F",
"--audio-frame-per-buffer",
dest="frame_per_buffer",
help="Audio frame per buffer [default: %(default)s]. "
"Optional and only effective when using PyAudio",
type=int,
default=1024,
metavar="INT",
)
group.add_argument(
"-f",
"--input-format",
dest="input_format",
type=str,
default=None,
help="Input audio file format. If not given, guess format from "
"extension. If output file name has no extension, guess format "
"from file header (requires pydub). If none of the previous is "
"true, raise an error",
metavar="STRING",
)
group.add_argument(
"-M",
"--max-read",
dest="max_read",
type=float,
default=None,
help="Maximum data (in seconds) to read from microphone or file "
"[default: read until the end of file/stream]",
metavar="FLOAT",
)
group.add_argument(
"-L",
"--large-file",
dest="large_file",
action="store_true",
default=False,
help="Whether input file should be treated as a large file. "
"If True, data will be read from file on demand, otherwise all "
"audio data is loaded to memory before tokenization.",
)
group.add_argument(
"-O",
"--save-stream",
dest="save_stream",
type=str,
default=None,
help="Save acquired audio data (from file or microphone) to disk."
" If omitted no data will be saved. [default: omitted]",
metavar="FILE",
)
group.add_argument(
"-o",
"--save-detections-as",
dest="save_detections_as",
type=str,
default=None,
help="File name format for detections."
"The following placeholders can be used to build output file name "
"for each detection: {id} (sequential, starts from 1), {start}, "
"{end} and {duration}. Time placeholders are in seconds. "
"Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
metavar="STRING",
)
group.add_argument(
"-T",
"--output-format",
dest="output_format",
type=str,
default=None,
help="Audio format used to save detections and/or main stream. "
"If not supplied, then it will: (1. be guessed from extension or "
"(2. use raw format",
metavar="STRING",
)
group.add_argument(
"-u",
"--use-channel",
dest="use_channel",
type=str,
default=None,
help="Which channel to use for tokenization when input stream is "
"multi-channel (0 is the first channel). Default is None, meaning "
"that all channels will be considered for tokenization (i.e., get "
"any valid audio event regardless of the channel it occurs in). "
"This value can also be 'mix' (alias 'avg' or 'average') and "
"means mix down all audio channels into one channel (i.e. compute "
"average channel) and use the resulting channel for tokenization. "
"Whatever option is used, saved audio events will contain the same"
" number of channels as input stream. "
"[Default: None, use all channels]",
metavar="INT/STRING",
)
group = parser.add_argument_group(
"Tokenization options", "Set tokenizer options."
)
group.add_argument(
"-a",
"--analysis-window",
dest="analysis_window",
default=0.01,
type=float,
help="Size of analysis window in seconds [default: %(default)s "
"(10ms)]",
metavar="FLOAT",
)
group.add_argument(
"-n",
"--min-duration",
dest="min_duration",
type=float,
default=0.2,
help="Min duration of a valid audio event in seconds "
"[default: %(default)s]",
metavar="FLOAT",
)
group.add_argument(
"-m",
"--max-duration",
dest="max_duration",
type=float,
default=5,
help="Max duration of a valid audio event in seconds "
"[default: %(default)s]",
metavar="FLOAT",
)
group.add_argument(
"-s",
"--max-silence",
dest="max_silence",
type=float,
default=0.3,
help="Max duration of a consecutive silence within a valid audio "
"event in seconds [default: %(default)s]",
metavar="FLOAT",
)
group.add_argument(
"-d",
"--drop-trailing-silence",
dest="drop_trailing_silence",
action="store_true",
default=False,
help="Drop trailing silence from a detection [default: keep "
"trailing silence]",
)
group.add_argument(
"-R",
"--strict-min-duration",
dest="strict_min_duration",
action="store_true",
default=False,
help="Reject an event shorter than --min-duration even if it's "
"adjacent to the latest valid event that reached max-duration "
"[default: keep such events]",
)
group.add_argument(
"-e",
"--energy-threshold",
dest="energy_threshold",
type=float,
default=50,
help="Log energy threshold for detection [default: %(default)s]",
metavar="FLOAT",
)
group = parser.add_argument_group(
"Audio parameters",
"Define audio parameters if data is read from a "
"headerless file (raw or stdin) or you want to use "
"different microphone parameters.",
)
group.add_argument(
"-r",
"--rate",
dest="sampling_rate",
type=int,
default=16000,
help="Sampling rate of audio data [default: %(default)s]",
metavar="INT",
)
group.add_argument(
"-c",
"--channels",
dest="channels",
type=int,
default=1,
help="Number of channels of audio data [default: %(default)s]",
metavar="INT",
)
group.add_argument(
"-w",
"--width",
dest="sample_width",
type=int,
default=2,
help="Number of bytes per audio sample [default: %(default)s]",
metavar="INT",
)
group = parser.add_argument_group(
"Do something with audio events",
"Use these options to print, play back or plot detections.",
)
group.add_argument(
"-C",
"--command",
dest="command",
type=str,
help="Command to call when an audio detection occurs. Use '{file}' "
"as a placeholder for the temporary wav file that will contain "
"event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
" or \"-C 'play -q {file}'\" to play audio with sox)",
metavar="STRING",
)
group.add_argument(
"-E",
"--echo",
dest="echo",
action="store_true",
default=False,
help="Play back each detection immediately using pyaudio",
)
group.add_argument(
"-B",
"--progress-bar",
dest="progress_bar",
action="store_true",
default=False,
help="Show a progress bar when playing audio",
)
group.add_argument(
"-p",
"--plot",
dest="plot",
action="store_true",
default=False,
help="Plot and show audio signal and detections (requires "
"matplotlib)",
)
group.add_argument(
"--save-image",
dest="save_image",
type=str,
help="Save plotted audio signal and detections as a picture or a "
"PDF file (requires matplotlib)",
metavar="FILE",
)
group.add_argument(
"--printf",
dest="printf",
type=str,
default="{id} {start} {end}",
help="Print audio events information, one per line, using this "
"format. Format can contain text with the following placeholders: "
"{id} (sequential, starts from 1), {start}, {end}, {duration} and "
"{timestamp}. The first 3 time placeholders are in seconds and "
"their format can be set using --time-format argument. "
"{timestamp} is the system timestamp (date and time) of the event "
"and can be set using --timestamp-format argument.\n"
"Example: '[{id}]: {start} -> {end} -- {timestamp}'",
metavar="STRING",
)
group.add_argument(
"--time-format",
dest="time_format",
type=str,
default="%S",
help="Format used to print {start}, {end} and {duration} "
"placeholders used with --printf [default= %(default)s]. The "
"following formats are accepted:\n"
"%%S: absolute time in seconds. %%I: absolute time in ms. If at "
"least one of (%%h, %%m, %%s, %%i) is used, convert time into "
"hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
"supplied fields are printed. Note that %%S and %%I can only be "
"used alone",
metavar="STRING",
)
group.add_argument(
"--timestamp-format",
dest="timestamp_format",
type=str,
default="%Y/%m/%d %H:%M:%S",
help="Format used to print {timestamp}. Should be a format "
"accepted by 'datetime' standard module. Default: "
"'%%Y/%%m/%%d %%H:%%M:%%S'",
)
parser.add_argument(
"-q",
"--quiet",
dest="quiet",
action="store_true",
default=False,
help="Do not print any information about detections [default: "
"print 'id', 'start' and 'end' of each detection]",
)
parser.add_argument(
"-D",
"--debug",
dest="debug",
action="store_true",
default=False,
help="Print processing operations to STDOUT",
)
parser.add_argument(
"--debug-file",
dest="debug_file",
type=str,
default=None,
help="Print processing operations to FILE",
metavar="FILE",
)
args = parser.parse_args(argv)
logger = make_logger(args.debug, args.debug_file)
kwargs = make_kwargs(args)
reader, observers = initialize_workers(
logger=logger, **kwargs.io, **kwargs.miscellaneous
)
tokenizer_worker = workers.TokenizerWorker(
reader, observers, logger=logger, **kwargs.split
)
tokenizer_worker.start_all()
while True:
time.sleep(1)
if len(threading.enumerate()) == 1:
raise EndOfProcessing
except (KeyboardInterrupt, EndOfProcessing):
if tokenizer_worker is not None:
tokenizer_worker.stop_all()
if isinstance(reader, workers.StreamSaverWorker):
reader.join()
try:
reader.save_stream()
except AudioEncodingWarning as ae_warn:
print(str(ae_warn), file=sys.stderr)
if args.plot or args.save_image is not None:
from .plotting import plot
reader.rewind()
record = AudioRegion(
reader.data, reader.sr, reader.sw, reader.ch
)
detections = (
(det.start, det.end) for det in tokenizer_worker.detections
)
plot(
record,
detections=detections,
energy_threshold=args.energy_threshold,
show=True,
save_as=args.save_image,
)
return 0
if __name__ == "__main__":
sys.exit(main(None))