#!/usr/bin/env python

"""Deduplicate repeated subtitles."""

import datetime
import srt_tools.utils
import logging
import operator

log = logging.getLogger(__name__)

try:  # Python 2
    range = xrange  # pytype: disable=name-error
except NameError:
    pass


def parse_args():
    """Build the command-line parser for this tool and parse the arguments.

    On top of whatever options ``srt_tools.utils.basic_parser`` provides, this
    adds ``-t``/``--ms``, which is converted to a ``datetime.timedelta`` and
    defaults to 5000 milliseconds.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples=examples,
    )
    parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=datetime.timedelta(milliseconds=5000),
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help="how many milliseconds distance a subtitle start time must be "
        "within of another to be considered a duplicate "
        "(default: 5000ms)",
    )
    return parser.parse_args()


def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content.

    Two subtitles are treated as duplicates when their ``content`` is equal
    and, once ordered by start time, the later one starts no more than
    ``acceptable_diff`` after the earlier one. The earlier subtitle is kept
    and the later one is dropped.

    Args:
        orig_subs: Mutable list of subtitle objects exposing ``content``,
            ``start`` and ``index`` attributes. It is modified in place.
        acceptable_diff: Maximum distance between start times (added to
            ``start`` and compared, e.g. a ``datetime.timedelta``). A falsy
            value, such as a zero timedelta, disables the time check so that
            equal content alone marks a duplicate.

    Returns:
        None. ``orig_subs`` is mutated; the relative order of the surviving
        subtitles is unchanged.
    """
    indices_to_remove = []

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.append(next_idx)

    # indices_to_remove was collected in content-sorted order, so it is not
    # ascending in terms of positions in orig_subs. Delete from the highest
    # position down: that way no deletion shifts an entry that is still
    # waiting to be removed, and no offset bookkeeping is needed.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]


def main():
    """Run the tool: read subtitles, drop duplicates, and write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    # Materialise the input so it can be indexed and mutated in place.
    subs = list(args.input)
    deduplicate_subs(subs, args.ms)

    output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(output.encode(args.encoding))


if __name__ == "__main__":  # pragma: no cover
    main()