bazarr/libs/rebulk/test/test_rebulk.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member, len-as-condition

from ..rebulk import Rebulk
from ..rules import Rule
from . import rebulk_rules_module as rm


def test_rebulk_simple():
    """Register a string, a regex and a functional pattern and check the three matches."""
    def func(input_string):
        # Span of the first "over" in the input, or None when it is absent.
        i = input_string.find("over")
        return (i, i + len("over")) if i > -1 else None

    parser = Rebulk()
    parser.string("quick").regex("f.x").functional(func)

    matches = parser.matches("The quick brown fox jumps over the lazy dog")

    assert len(matches) == 3
    for index, expected in enumerate(("quick", "fox", "over")):
        assert matches[index].value == expected


def test_rebulk_composition():
    """Nest two child Rebulk objects into a parent and check the combined matches.

    The second child is built with a ``disabled`` callable that always returns
    True and a functional pattern that returns None; the test expects only the
    parent's string match and the first child's regex match.
    """
    sentence = "The quick brown fox jumps over the lazy dog"

    parent = Rebulk()
    parent.string("quick")

    regex_child = Rebulk().regex("f.x")
    parent.rebulk(regex_child)

    disabled_child = Rebulk(disabled=lambda context: True).functional(lambda string: None)
    parent.rebulk(disabled_child)

    matches = parent.matches(sentence)

    assert len(matches) == 2
    for index, expected in enumerate(("quick", "fox")):
        assert matches[index].value == expected


def test_rebulk_context():
    """Check that the context given to ``matches`` drives ``disabled`` callables and functional patterns.

    The context sets 'nostring' (so the string pattern's ``disabled`` callable
    returns True) and 'word' (the word the functional pattern searches for).
    'noregex' is absent, so the regex pattern's ``disabled`` callable returns
    False.
    """
    def func(input_string, context):
        # The searched word comes from the context, falling back to "over".
        word = context.get('word', 'over')
        i = input_string.find(word)
        return (i, i + len(word)) if i > -1 else None

    parser = Rebulk()
    parser.string("quick", disabled=lambda context: context.get('nostring', False))
    parser.regex("f.x", disabled=lambda context: context.get('noregex', False))
    parser.functional(func)

    matches = parser.matches("The quick brown fox jumps over the lazy dog",
                             {'nostring': True, 'word': 'lazy'})

    assert len(matches) == 2
    for index, expected in enumerate(("fox", "lazy")):
        assert matches[index].value == expected


def test_rebulk_prefer_longer():
    """With default rules, "own" is expected to be absent while the overlapping "brown" is kept."""
    parser = Rebulk()
    parser.string("quick")
    parser.string("own")
    parser.regex("br.{2}n")

    matches = parser.matches("The quick brown fox jumps over the lazy dog")

    assert len(matches) == 2
    for index, expected in enumerate(("quick", "brown")):
        assert matches[index].value == expected


def test_rebulk_defaults():
    """Check how per-pattern-type defaults and global defaults set match names and tags."""
    def func(input_string):
        # Span of the first "fox" in the input, or None when it is absent.
        i = input_string.find("fox")
        return (i, i + len("fox")) if i > -1 else None

    sentence = "The quick brown fox jumps over the lazy dog"

    # Per-type defaults only: each match is expected to carry its pattern
    # type's default name, and the string pattern's own tags are expected
    # after the default string tags.
    typed = Rebulk()
    typed.string_defaults(name="string", tags=["a", "b"])
    typed.regex_defaults(name="regex")
    typed.functional_defaults(name="functional")
    typed.string("quick", tags=["c"])
    typed.functional(func)
    typed.regex("br.{2}n")
    matches = typed.matches(sentence)

    assert matches[0].name == "string"
    assert matches[0].tags == ["a", "b", "c"]
    assert matches[1].name == "functional"
    assert matches[2].name == "regex"

    # Global defaults combined with per-type defaults: the regex pattern has
    # no per-type defaults here, so it is expected to fall back to the global
    # name and tags.
    layered = Rebulk()
    layered.defaults(name="default", tags=["0"])
    layered.string_defaults(name="string", tags=["a", "b"])
    layered.functional_defaults(name="functional", tags=["1"])
    layered.string("quick", tags=["c"])
    layered.functional(func)
    layered.regex("br.{2}n")
    matches = layered.matches(sentence)

    expected = (
        ("string", ["0", "a", "b", "c"]),
        ("functional", ["0", "1"]),
        ("default", ["0"]),
    )
    for index, (name, tags) in enumerate(expected):
        assert matches[index].name == name
        assert matches[index].tags == tags


def test_rebulk_defaults_overrides():
    """Check that ``overrides=["tags"]`` makes a pattern's tags replace the default tags."""
    def func(input_string):
        # Span of the first "fox" in the input, or None when it is absent.
        i = input_string.find("fox")
        return (i, i + len("fox")) if i > -1 else None

    sentence = "The quick brown fox jumps over the lazy dog"

    # The string pattern overrides "tags": only its own ["c"] is expected,
    # while the regex pattern keeps its default tags.
    overriding = Rebulk()
    overriding.string_defaults(name="string", tags=["a", "b"])
    overriding.regex_defaults(name="regex", tags=["d"])
    overriding.functional_defaults(name="functional")
    overriding.string("quick", tags=["c"], overrides=["tags"])
    overriding.functional(func)
    overriding.regex("br.{2}n")
    matches = overriding.matches(sentence)

    assert matches[0].name == "string"
    assert matches[0].tags == ["c"]
    assert matches[1].name == "functional"
    assert matches[2].name == "regex"
    assert matches[2].tags == ["d"]

    # Without overrides, global and per-type default tags are expected to be
    # followed by the pattern's own tags.
    layered = Rebulk()
    layered.defaults(name="default", tags=["0"])
    layered.string_defaults(name="string", tags=["a", "b"])
    layered.functional_defaults(name="functional", tags=["1"])
    layered.string("quick", tags=["c"])
    layered.functional(func)
    layered.regex("br.{2}n")
    matches = layered.matches(sentence)

    expected = (
        ("string", ["0", "a", "b", "c"]),
        ("functional", ["0", "1"]),
        ("default", ["0"]),
    )
    for index, (name, tags) in enumerate(expected):
        assert matches[index].name == name
        assert matches[index].tags == tags


def test_rebulk_rebulk():
    """Attach a child Rebulk to a base Rebulk and check the matches of the combination."""
    base = Rebulk()
    base.string("quick")

    child = Rebulk()
    child.string("own")
    child.regex("br.{2}n")

    combined = base.rebulk(child)
    matches = combined.matches("The quick brown fox jumps over the lazy dog")

    assert len(matches) == 2
    for index, expected in enumerate(("quick", "brown")):
        assert matches[index].value == expected


def test_rebulk_no_default():
    """With ``default_rules=False`` the overlapping "own" and "brown" matches are both expected."""
    parser = Rebulk(default_rules=False)
    parser.string("quick")
    parser.string("own")
    parser.regex("br.{2}n")

    matches = parser.matches("The quick brown fox jumps over the lazy dog")

    assert len(matches) == 3
    for index, expected in enumerate(("quick", "own", "brown")):
        assert matches[index].value == expected


def test_rebulk_empty_match():
    """A ``children=True`` regex whose only group captures nothing is expected to add no match.

    In "brown" the group of "br(.*?)own" has nothing to capture, so only the
    two string matches are expected.
    """
    parser = Rebulk(default_rules=False)
    parser.string("quick")
    parser.string("own")
    parser.regex("br(.*?)own", children=True)

    matches = parser.matches("The quick brown fox jumps over the lazy dog")

    assert len(matches) == 2
    for index, expected in enumerate(("quick", "own")):
        assert matches[index].value == expected


def test_rebulk_tags_names():
    """Check ``named`` and ``tagged`` lookups on matches from string, regex and functional patterns.

    The two functional patterns supply their tags through their return value:
    one as a third tuple item, the other as a key of a returned dict.
    """
    def func(input_string):
        # Tuple form: (start, end, extra keyword values).
        i = input_string.find("over")
        return (i, i + len("over"), {'tags': ['custom']}) if i > -1 else None

    def func2(input_string):
        # Dict form: start, end and tags all given as keys.
        i = input_string.find("lazy")
        return {'start': i, 'end': i + len("lazy"), 'tags': ['custom']} if i > -1 else None

    parser = Rebulk()
    parser.string("quick", name="str", tags=["first", "other"])
    parser.regex("f.x", tags="other")
    parser.functional(func, name="fn")
    parser.functional(func2, name="fn")

    matches = parser.matches("The quick brown fox jumps over the lazy dog")
    assert len(matches) == 4

    for name, count in (("str", 1), ("fn", 2), ("false", 0)):
        assert len(matches.named(name)) == count

    for tag, count in (("false", 0), ("first", 1), ("other", 2), ("custom", 2)):
        assert len(matches.tagged(tag)) == count


def test_rebulk_rules_1():
    """Apply ``rm.RemoveAllButLastYear`` to year matches; only "1982" is expected to remain."""
    parser = Rebulk().regex(r'\d{4}', name="year").rules(rm.RemoveAllButLastYear)

    remaining = parser.matches("1984 keep only last 1968 entry 1982 case")

    assert len(remaining) == 1
    assert remaining[0].value == "1982"


def test_rebulk_rules_2():
    """Apply ``rm.PrefixedSuffixedYear`` with private prefix/suffix patterns.

    The rule itself lives in the rules module; per the assertions below, only
    the years "1984" and "1968" are expected in the result.
    """
    parser = (
        Rebulk()
        .regex(r'\d{4}', name="year")
        .string(r'year', name="yearPrefix", private=True)
        .string(r'keep', name="yearSuffix", private=True)
        .rules(rm.PrefixedSuffixedYear)
    )

    remaining = parser.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982")

    assert len(remaining) == 2
    for index, expected in enumerate(("1984", "1968")):
        assert remaining[index].value == expected


def test_rebulk_rules_3():
    """Apply ``rm.PrefixedSuffixedYearNoLambda`` with private prefix/suffix patterns.

    Uses the same input and expects the same result as the
    ``rm.PrefixedSuffixedYear`` test: only "1984" and "1968" remain.
    """
    parser = (
        Rebulk()
        .regex(r'\d{4}', name="year")
        .string(r'year', name="yearPrefix", private=True)
        .string(r'keep', name="yearSuffix", private=True)
        .rules(rm.PrefixedSuffixedYearNoLambda)
    )

    remaining = parser.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982")

    assert len(remaining) == 2
    for index, expected in enumerate(("1984", "1968")):
        assert remaining[index].value == expected


def test_rebulk_rules_4():
    """A locally defined rule removes the "grabbed" match when ``matches.previous`` finds something before it."""
    class FirstOnlyRule(Rule):
        def when(self, matches, context):
            # Select the first "grabbed" match only when a previous match exists.
            candidate = matches.named("grabbed", 0)
            if not candidate:
                return None
            if not matches.previous(candidate):
                return None
            return candidate

        def then(self, matches, when_response, context):
            matches.remove(when_response)

    parser = Rebulk()
    parser.regex("This match (.*?)grabbed", name="grabbed")
    parser.regex("if it's (.*?)first match", private=True)
    parser.rules(FirstOnlyRule)

    kept = parser.matches("This match is grabbed only if it's the first match")
    assert len(kept) == 1
    assert kept[0].value == "This match is grabbed"

    dropped = parser.matches("if it's NOT the first match, This match is NOT grabbed")
    assert len(dropped) == 0


class TestMarkers(object):
    """Tests for rules that look up or remove marker matches (patterns declared with ``marker=True``)."""

    def test_one_marker(self):
        """Keep "word" only when a "mark1" marker is found for the word match."""
        class MarkerRule(Rule):
            def when(self, matches, context):
                # Select the word for removal unless a "mark1" marker is found at it.
                word_match = matches.named("word", 0)
                found = matches.markers.at_match(word_match, lambda marker: marker.name == "mark1", 0)
                return None if found else word_match

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        parser = Rebulk()
        parser.regex(r'\(.*?\)', marker=True, name="mark1")
        parser.regex(r'\[.*?\]', marker=True, name="mark2")
        parser.string("word", name="word")
        parser.rules(MarkerRule)

        in_parenthesis = parser.matches("grab (word) only if it's in parenthesis")
        assert len(in_parenthesis) == 1
        assert in_parenthesis[0].value == "word"

        in_bracket = parser.matches("don't grab [word] if it's in braket")
        assert len(in_bracket) == 0

        unmarked = parser.matches("don't grab word at all")
        assert len(unmarked) == 0

    def test_multiple_marker(self):
        """Keep the word only when at least two "mark1"/"mark2" markers are found for it."""
        class MarkerRule(Rule):
            def when(self, matches, context):
                # Select the word for removal when fewer than two markers are found at it.
                word_match = matches.named("word", 0)
                found = matches.markers.at_match(word_match,
                                                 lambda marker: marker.name in ["mark1", "mark2"])
                return word_match if len(found) < 2 else None

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        parser = Rebulk()
        parser.regex(r'\(.*?\)', marker=True, name="mark1")
        parser.regex(r'\[.*?\]', marker=True, name="mark2")
        parser.regex("w.*?d", name="word")
        parser.rules(MarkerRule)

        nested = parser.matches("[grab (word) only] if it's in parenthesis and brakets")
        assert len(nested) == 1
        assert nested[0].value == "word"

        single = parser.matches("[don't grab](word)[if brakets are outside]")
        assert len(single) == 0

        partial = parser.matches("(grab w[or)d even] if it's partially in parenthesis and brakets")
        assert len(partial) == 1
        assert partial[0].value == "w[or)d"

    def test_at_index_marker(self):
        """Keep the word only when a "mark1" marker is found at the word's start index."""
        class MarkerRule(Rule):
            def when(self, matches, context):
                # Only the start index of the word is checked against markers.
                word_match = matches.named("word", 0)
                found = matches.markers.at_index(word_match.start,
                                                 lambda marker: marker.name == "mark1", 0)
                return None if found else word_match

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        parser = Rebulk()
        parser.regex(r'\(.*?\)', marker=True, name="mark1")
        parser.regex("w.*?d", name="word")
        parser.rules(MarkerRule)

        start_inside = parser.matches("gr(ab wo)rd only if starting of match is inside parenthesis")
        assert len(start_inside) == 1
        assert start_inside[0].value == "wo)rd"

        start_outside = parser.matches("don't grab wo(rd if starting of match is not inside parenthesis")
        assert len(start_outside) == 0

    def test_remove_marker(self):
        """A rule removes the "mark1" marker; the word match stays and no markers remain."""
        class MarkerRule(Rule):
            def when(self, matches, context):
                # Return the first "mark1" marker when there is one.
                found = matches.markers.named("mark1", 0)
                return found if found else None

            def then(self, matches, when_response, context):
                matches.markers.remove(when_response)

        parser = Rebulk()
        parser.regex(r'\(.*?\)', marker=True, name="mark1")
        parser.regex("w.*?d", name="word")
        parser.rules(MarkerRule)

        result = parser.matches("grab word event (if it's not) inside parenthesis")

        assert len(result) == 1
        assert result[0].value == "word"

        assert not result.markers


class TestUnicode(object):
    """String, regex and functional patterns applied to non-ASCII input."""

    def test_rebulk_simple(self):
        """Match three consecutive single CJK characters, one per pattern type."""
        def func(input_string):
            # Span of the first "的" in the input, or None when it is absent.
            i = input_string.find("的")
            return (i, i + len("的")) if i > -1 else None

        parser = Rebulk()
        parser.string("敏").regex("捷").functional(func)

        matches = parser.matches("敏捷的棕色狐狸跳過懶狗")

        assert len(matches) == 3
        for index, expected in enumerate(("敏", "捷", "的")):
            assert matches[index].value == expected


class TestImmutable(object):
    """Removing items from a lookup result must leave the source matches untouched."""

    def test_starting(self):
        """Emptying every ``starting(i)`` result keeps all three matches."""
        sentence = "The quick brown fox jumps over the lazy dog"
        parser = Rebulk()
        parser.string("quick").string("over").string("fox")
        matches = parser.matches(sentence)

        for index in range(len(sentence)):
            starting = matches.starting(index)
            # Iterate over a copy because the result is mutated while looping.
            for match in list(starting):
                starting.remove(match)

        assert len(matches) == 3

    def test_ending(self):
        """Emptying every ``ending(i)`` result keeps all three matches."""
        sentence = "The quick brown fox jumps over the lazy dog"
        parser = Rebulk()
        parser.string("quick").string("over").string("fox")
        matches = parser.matches(sentence)

        for index in range(len(sentence)):
            ending = matches.ending(index)
            # Iterate over a copy because the result is mutated while looping.
            for match in list(ending):
                ending.remove(match)

        assert len(matches) == 3

    def test_named(self):
        """Emptying the ``named('test')`` result empties that result only, not the matches."""
        sentence = "The quick brown fox jumps over the lazy dog"
        parser = Rebulk()
        parser.defaults(name='test')
        parser.string("quick").string("over").string("fox")
        matches = parser.matches(sentence)

        named = matches.named('test')
        # Iterate over a copy because the result is mutated while looping.
        for match in list(named):
            named.remove(match)

        assert len(named) == 0
        assert len(matches) == 3