bazarr/libs/auditok/cmdline.py

#!/usr/bin/env python
# encoding: utf-8
"""
`auditok` -- An Audio Activity Detection tool

`auditok` is a program that can be used for Audio/Acoustic
activity detection. It can read audio data from audio files as well
as from the microphone or standard input.

@author:     Mohamed El Amine SEHILI
@copyright:  2015-2021 Mohamed El Amine SEHILI
@license:    MIT
@contact:    amine.sehili@gmail.com
@deffield    updated: 01 Mar 2021
"""

import sys
import os
from argparse import ArgumentParser
import time
import threading

from auditok import __version__, AudioRegion
from .util import AudioDataSource
from .exceptions import EndOfProcessing, AudioEncodingWarning
from .io import player_for
from .cmdline_util import make_logger, make_kwargs, initialize_workers
from . import workers


# Empty export list: ``from <this module> import *`` exposes no names.
__all__ = []
# presumably the date this module was first written -- TODO confirm
__date__ = "2015-11-23"
# Date of the last update; agrees with the "updated" field of the module
# docstring (01 Mar 2021).
__updated__ = "2021-03-01"


def _build_parser(program_name):
    """Create the argument parser describing every `auditok` CLI option.

    Parameters
    ----------
    program_name : str
        Name shown as the program name in usage and help messages.

    Returns
    -------
    argparse.ArgumentParser
        Parser with the input/output, tokenization, audio-parameter,
        detection-action and logging options registered.
    """
    parser = ArgumentParser(
        prog=program_name, description="An Audio Tokenization tool"
    )
    parser.add_argument(
        "--version", "-v", action="version", version=__version__
    )

    # ------------------------------------------------------------------
    # Input / output
    # ------------------------------------------------------------------
    group = parser.add_argument_group("Input-Output options")
    group.add_argument(
        dest="input",
        help="Input audio or video file. Use '-' for stdin "
        "[default: read from microphone using pyaudio]",
        metavar="input",
        nargs="?",
        default=None,
    )
    group.add_argument(
        "-I",
        "--input-device-index",
        dest="input_device_index",
        help="Audio device index [default: %(default)s]. "
        "Optional and only effective when using PyAudio",
        type=int,
        default=None,
        metavar="INT",
    )
    group.add_argument(
        "-F",
        "--audio-frame-per-buffer",
        dest="frame_per_buffer",
        help="Audio frame per buffer [default: %(default)s]. "
        "Optional and only effective when using PyAudio",
        type=int,
        default=1024,
        metavar="INT",
    )
    group.add_argument(
        "-f",
        "--input-format",
        dest="input_format",
        type=str,
        default=None,
        help="Input audio file format. If not given, guess format from "
        "extension. If output file name has no extension, guess format "
        "from file header (requires pydub). If none of the previous is "
        "true, raise an error",
        metavar="STRING",
    )
    group.add_argument(
        "-M",
        "--max-read",
        dest="max_read",
        type=float,
        default=None,
        help="Maximum data (in seconds) to read from microphone or file "
        "[default: read until the end of file/stream]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-L",
        "--large-file",
        dest="large_file",
        action="store_true",
        default=False,
        help="Whether input file should be treated as a large file. "
        "If True, data will be read from file on demand, otherwise all "
        "audio data is loaded to memory before tokenization.",
    )
    group.add_argument(
        "-O",
        "--save-stream",
        dest="save_stream",
        type=str,
        default=None,
        help="Save acquired audio data (from file or microphone) to disk."
        " If omitted no data will be saved. [default: omitted]",
        metavar="FILE",
    )
    group.add_argument(
        "-o",
        "--save-detections-as",
        dest="save_detections_as",
        type=str,
        default=None,
        help="File name format for detections."
        "The following placeholders can be used to build output file name "
        "for each detection: {id} (sequential, starts from 1), {start}, "
        "{end} and {duration}. Time placeholders are in seconds. "
        "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
        metavar="STRING",
    )
    group.add_argument(
        "-T",
        "--output-format",
        dest="output_format",
        type=str,
        default=None,
        help="Audio format used to save detections and/or main stream. "
        "If not supplied, then it will: (1. be guessed from extension or "
        "(2. use raw format",
        metavar="STRING",
    )
    group.add_argument(
        "-u",
        "--use-channel",
        dest="use_channel",
        type=str,
        default=None,
        help="Which channel to use for tokenization when input stream is "
        "multi-channel (0 is the first channel). Default is None, meaning "
        "that all channels will be considered for tokenization (i.e., get "
        "any valid audio event regardless of the channel it occurs in). "
        "This value can also be 'mix' (alias 'avg' or 'average') and "
        "means mix down all audio channels into one channel (i.e. compute "
        "average channel) and use the resulting channel for tokenization. "
        "Whatever option is used, saved audio events will contain the same"
        " number of channels as input stream. "
        "[Default: None, use all channels]",
        metavar="INT/STRING",
    )

    # ------------------------------------------------------------------
    # Tokenization
    # ------------------------------------------------------------------
    group = parser.add_argument_group(
        "Tokenization options", "Set tokenizer options."
    )
    group.add_argument(
        "-a",
        "--analysis-window",
        dest="analysis_window",
        default=0.01,
        type=float,
        help="Size of analysis window in seconds [default: %(default)s "
        "(10ms)]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-n",
        "--min-duration",
        dest="min_duration",
        type=float,
        default=0.2,
        help="Min duration of a valid audio event in seconds "
        "[default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-m",
        "--max-duration",
        dest="max_duration",
        type=float,
        default=5,
        help="Max duration of a valid audio event in seconds "
        "[default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-s",
        "--max-silence",
        dest="max_silence",
        type=float,
        default=0.3,
        help="Max duration of a consecutive silence within a valid audio "
        "event in seconds [default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-d",
        "--drop-trailing-silence",
        dest="drop_trailing_silence",
        action="store_true",
        default=False,
        help="Drop trailing silence from a detection [default: keep "
        "trailing silence]",
    )
    group.add_argument(
        "-R",
        "--strict-min-duration",
        dest="strict_min_duration",
        action="store_true",
        default=False,
        help="Reject an event shorter than --min-duration even if it's "
        "adjacent to the latest valid event that reached max-duration "
        "[default: keep such events]",
    )
    group.add_argument(
        "-e",
        "--energy-threshold",
        dest="energy_threshold",
        type=float,
        default=50,
        help="Log energy threshold for detection [default: %(default)s]",
        metavar="FLOAT",
    )

    # ------------------------------------------------------------------
    # Raw audio parameters
    # ------------------------------------------------------------------
    group = parser.add_argument_group(
        "Audio parameters",
        "Define audio parameters if data is read from a "
        "headerless file (raw or stdin) or you want to use "
        "different microphone parameters.",
    )
    group.add_argument(
        "-r",
        "--rate",
        dest="sampling_rate",
        type=int,
        default=16000,
        help="Sampling rate of audio data [default: %(default)s]",
        metavar="INT",
    )
    group.add_argument(
        "-c",
        "--channels",
        dest="channels",
        type=int,
        default=1,
        help="Number of channels of audio data [default: %(default)s]",
        metavar="INT",
    )
    group.add_argument(
        "-w",
        "--width",
        dest="sample_width",
        type=int,
        default=2,
        help="Number of bytes per audio sample [default: %(default)s]",
        metavar="INT",
    )

    # ------------------------------------------------------------------
    # Actions performed on detections
    # ------------------------------------------------------------------
    group = parser.add_argument_group(
        "Do something with audio events",
        "Use these options to print, play back or plot detections.",
    )
    group.add_argument(
        "-C",
        "--command",
        dest="command",
        type=str,
        help="Command to call when an audio detection occurs. Use '{file}' "
        "as a placeholder for the temporary wav file that will contain "
        "event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
        " or \"-C 'play -q {file}'\" to play audio with sox)",
        metavar="STRING",
    )
    group.add_argument(
        "-E",
        "--echo",
        dest="echo",
        action="store_true",
        default=False,
        help="Play back each detection immediately using pyaudio",
    )
    group.add_argument(
        "-B",
        "--progress-bar",
        dest="progress_bar",
        action="store_true",
        default=False,
        help="Show a progress bar when playing audio",
    )
    group.add_argument(
        "-p",
        "--plot",
        dest="plot",
        action="store_true",
        default=False,
        help="Plot and show audio signal and detections (requires "
        "matplotlib)",
    )
    group.add_argument(
        "--save-image",
        dest="save_image",
        type=str,
        help="Save plotted audio signal and detections as a picture or a "
        "PDF file (requires matplotlib)",
        metavar="FILE",
    )
    group.add_argument(
        "--printf",
        dest="printf",
        type=str,
        default="{id} {start} {end}",
        help="Print audio events information, one per line, using this "
        "format. Format can contain text with the following placeholders: "
        "{id} (sequential, starts from 1), {start}, {end}, {duration} and "
        "{timestamp}. The first 3 time placeholders are in seconds and "
        "their format can be set using --time-format argument. "
        "{timestamp} is the system timestamp (date and time) of the event "
        "and can be set using --timestamp-format argument.\n"
        "Example: '[{id}]: {start} -> {end} -- {timestamp}'",
        metavar="STRING",
    )
    group.add_argument(
        "--time-format",
        dest="time_format",
        type=str,
        default="%S",
        help="Format used to print {start}, {end} and {duration} "
        "placeholders used with --printf [default= %(default)s]. The "
        "following formats are accepted:\n"
        "%%S: absolute time in seconds. %%I: absolute time in ms. If at "
        "least one of (%%h, %%m, %%s, %%i) is used, convert time into "
        "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
        "supplied fields are printed. Note that %%S and %%I can only be "
        "used alone",
        metavar="STRING",
    )
    group.add_argument(
        "--timestamp-format",
        dest="timestamp_format",
        type=str,
        default="%Y/%m/%d %H:%M:%S",
        help="Format used to print {timestamp}. Should be a format "
        "accepted by 'datetime' standard module. Default: "
        "'%%Y/%%m/%%d %%H:%%M:%%S'",
    )

    # ------------------------------------------------------------------
    # Verbosity / logging (registered on the parser itself, not a group)
    # ------------------------------------------------------------------
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        action="store_true",
        default=False,
        help="Do not print any information about detections [default: "
        "print 'id', 'start' and 'end' of each detection]",
    )
    parser.add_argument(
        "-D",
        "--debug",
        dest="debug",
        action="store_true",
        default=False,
        help="Print processing operations to STDOUT",
    )
    parser.add_argument(
        "--debug-file",
        dest="debug_file",
        type=str,
        default=None,
        help="Print processing operations to FILE",
        metavar="FILE",
    )
    return parser


def main(argv=None):
    """Run the `auditok` command-line tool.

    Parses the command line, creates the reader/observer workers and the
    tokenizer worker, starts them, then polls once per second until only
    one thread is left alive. On completion or on Ctrl-C the workers are
    stopped, the stream is saved when the reader is a
    ``StreamSaverWorker``, and the signal is plotted when ``--plot`` or
    ``--save-image`` was given.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name. When None,
        ``sys.argv[1:]`` is used.

    Returns
    -------
    int
        Always 0 when processing ends or is interrupted.

    Raises
    ------
    SystemExit
        Raised by ``argparse`` for ``--help``, ``--version`` or invalid
        arguments; it is not intercepted here.
    """
    program_name = os.path.basename(sys.argv[0])
    if argv is None:
        argv = sys.argv[1:]

    # Bind these before entering the ``try`` block: the handler below
    # inspects ``tokenizer_worker``, and an interrupt that arrives before
    # the workers are created would otherwise raise UnboundLocalError
    # instead of exiting cleanly.
    tokenizer_worker = None
    reader = None
    try:
        parser = _build_parser(program_name)
        args = parser.parse_args(argv)
        logger = make_logger(args.debug, args.debug_file)
        kwargs = make_kwargs(args)
        reader, observers = initialize_workers(
            logger=logger, **kwargs.io, **kwargs.miscellaneous
        )
        tokenizer_worker = workers.TokenizerWorker(
            reader, observers, logger=logger, **kwargs.split
        )
        tokenizer_worker.start_all()

        # Poll until the calling thread is the only one still alive, then
        # reuse the interrupt path below to finalize processing.
        while True:
            time.sleep(1)
            if len(threading.enumerate()) == 1:
                raise EndOfProcessing

    except (KeyboardInterrupt, EndOfProcessing):
        # ``args`` is necessarily bound whenever ``tokenizer_worker`` is
        # set, because it is assigned earlier in the ``try`` body.
        if tokenizer_worker is not None:
            tokenizer_worker.stop_all()

            if isinstance(reader, workers.StreamSaverWorker):
                reader.join()
                try:
                    reader.save_stream()
                except AudioEncodingWarning as ae_warn:
                    print(str(ae_warn), file=sys.stderr)

            if args.plot or args.save_image is not None:
                # Imported lazily so the plotting module is only loaded
                # when a plot was actually requested.
                from .plotting import plot

                reader.rewind()
                record = AudioRegion(
                    reader.data, reader.sr, reader.sw, reader.ch
                )
                detections = (
                    (det.start, det.end) for det in tokenizer_worker.detections
                )
                plot(
                    record,
                    detections=detections,
                    energy_threshold=args.energy_threshold,
                    show=True,
                    save_as=args.save_image,
                )
        return 0


if __name__ == "__main__":
    # Executed as a script: run the CLI on the process arguments and use
    # the value returned by main() as the process exit status.
    exit_status = main(None)
    sys.exit(exit_status)