bazarr/libs/ftfy/streamtester/twitter_tester.py

"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.

This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
from __future__ import print_function, unicode_literals
import os
from collections import defaultdict
from ftfy.streamtester import StreamTester


class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

        - Success: the process takes in mojibake and outputs correct text.
        - False positive: the process takes in correct text, and outputs
          mojibake. Every false positive should be considered a bug, and
          reported on GitHub if it isn't already.
        - Confusion: the process takes in mojibake and outputs different
          mojibake. Not a great outcome, but not as dire as a false
          positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    # Directory that the per-language tweet logs are appended to. The path
    # is relative, so it resolves against the current working directory.
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        # Tweet texts collected since the last `save_files` call, keyed by
        # the language code taken from the tweet's `user` entry.
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.

        Each language gets its own file, `tweets.<lang>.txt`, inside
        `OUTPUT_DIR`. Files are opened in append mode and encoded as UTF-8,
        with one tweet per line: newlines inside a tweet are replaced by
        spaces. After all languages are written, the in-memory buffer is
        reset to an empty `defaultdict(list)`.
        """
        # `exist_ok=True` avoids the window between checking for the
        # directory and creating it.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            # The context manager closes the file even if a write fails.
            with open(fullname, 'a', encoding='utf-8') as langfile:
                for line in lines:
                    print(line.replace('\n', ' '), file=langfile)
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.

        Only stream items that have a 'text' key are processed. Those that
        also have a 'user' key are buffered by language for `save_files`.
        """
        # Imported here so that the `twitter` package and the OAuth helper
        # are only needed when the live stream is actually used.
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            if 'text' not in tweet:
                continue
            self.check_ftfy(tweet['text'])
            if 'user' in tweet:
                # `or` also covers a 'lang' key that is present but empty
                # or None, so such tweets share the 'NONE' bucket instead
                # of being logged under the literal name "None".
                lang = tweet['user'].get('lang') or 'NONE'
                self.lines_by_lang[lang].append(tweet['text'])
            # `self.count` is not set in this class; presumably it is the
            # running tweet counter kept by StreamTester -- TODO confirm.
            # The test is true when count is 100, 10100, 20100, and so on.
            if self.count % 10000 == 100:
                self.save_files()


def main():
    """
    Command-line entry point.

    Builds a TwitterTester and hands control to its `run_sample` method,
    which keeps consuming the Twitter stream for as long as the stream
    stays up.
    """
    TwitterTester().run_sample()


# Start the tester only when this file is executed as a script; importing
# the module must not start the Twitter tester.
if __name__ == '__main__':
    main()
update deps 2018-10-31 16:08:29 +00:00			`"""`
			`Implements a StreamTester that runs over Twitter data. See the class`
			`docstring.`

			`This module is written for Python 3 only. The __future__ imports you see here`
			`are just to let Python 2 scan the file without crashing with a SyntaxError.`
			`"""`
			`from __future__ import print_function, unicode_literals`
			`import os`
			`from collections import defaultdict`
			`from ftfy.streamtester import StreamTester`


			`class TwitterTester(StreamTester):`
			`"""`
			This class uses the StreamTester code (defined in `__init__.py`) to
			`evaluate ftfy's real-world performance, by feeding it live data from`
			`Twitter.`

			`This is a semi-manual evaluation. It requires a human to look at the`
			`results and determine if they are good. The three possible cases we`
			`can see here are:`

			`- Success: the process takes in mojibake and outputs correct text.`
			`- False positive: the process takes in correct text, and outputs`
			`mojibake. Every false positive should be considered a bug, and`
			`reported on GitHub if it isn't already.`
			`- Confusion: the process takes in mojibake and outputs different`
			`mojibake. Not a great outcome, but not as dire as a false`
			`positive.`

			`This tester cannot reveal false negatives. So far, that can only be`
			`done by the unit tests.`
			`"""`
			`OUTPUT_DIR = './twitterlogs'`

			`def __init__(self):`
			`self.lines_by_lang = defaultdict(list)`
			`super().__init__()`

			`def save_files(self):`
			`"""`
			`When processing data from live Twitter, save it to log files so that`
			`it can be replayed later.`
			`"""`
			`if not os.path.exists(self.OUTPUT_DIR):`
			`os.makedirs(self.OUTPUT_DIR)`
			`for lang, lines in self.lines_by_lang.items():`
			`filename = 'tweets.{}.txt'.format(lang)`
			`fullname = os.path.join(self.OUTPUT_DIR, filename)`
			`langfile = open(fullname, 'a', encoding='utf-8')`
			`for line in lines:`
			`print(line.replace('\n', ' '), file=langfile)`
			`langfile.close()`
			`self.lines_by_lang = defaultdict(list)`

			`def run_sample(self):`
			`"""`
			`Listen to live data from Twitter, and pass on the fully-formed tweets`
			to `check_ftfy`. This requires the `twitter` Python package as a
			`dependency.`
			`"""`
			`from twitter import TwitterStream`
			`from ftfy.streamtester.oauth import get_auth`
			`twitter_stream = TwitterStream(auth=get_auth())`
			`iterator = twitter_stream.statuses.sample()`
			`for tweet in iterator:`
			`if 'text' in tweet:`
			`self.check_ftfy(tweet['text'])`
			`if 'user' in tweet:`
			`lang = tweet['user'].get('lang', 'NONE')`
			`self.lines_by_lang[lang].append(tweet['text'])`
			`if self.count % 10000 == 100:`
			`self.save_files()`


			`def main():`
			`"""`
			`When run from the command line, this script connects to the Twitter stream`
			`and runs the TwitterTester on it forever. Or at least until the stream`
			`drops.`
			`"""`
			`tester = TwitterTester()`
			`tester.run_sample()`


			`if __name__ == '__main__':`
			`main()`