mirror of https://github.com/morpheus65535/bazarr
89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
|
"""
Implements a StreamTester that runs over Twitter data. See the class
docstring.

This module is written for Python 3 only. The __future__ imports you see here
are just to let Python 2 scan the file without crashing with a SyntaxError.
"""
|
||
|
from __future__ import print_function, unicode_literals
|
||
|
import os
|
||
|
from collections import defaultdict
|
||
|
from ftfy.streamtester import StreamTester
|
||
|
|
||
|
|
||
|
class TwitterTester(StreamTester):
    """
    This class uses the StreamTester code (defined in `__init__.py`) to
    evaluate ftfy's real-world performance, by feeding it live data from
    Twitter.

    This is a semi-manual evaluation. It requires a human to look at the
    results and determine if they are good. The three possible cases we
    can see here are:

        - Success: the process takes in mojibake and outputs correct text.
        - False positive: the process takes in correct text, and outputs
          mojibake. Every false positive should be considered a bug, and
          reported on GitHub if it isn't already.
        - Confusion: the process takes in mojibake and outputs different
          mojibake. Not a great outcome, but not as dire as a false
          positive.

    This tester cannot reveal false negatives. So far, that can only be
    done by the unit tests.
    """
    # Directory (relative to the working directory) that receives the
    # per-language tweet log files written by `save_files`.
    OUTPUT_DIR = './twitterlogs'

    def __init__(self):
        # Tweets collected since the last `save_files` call, keyed by the
        # language code taken from the tweet's 'user' record.
        self.lines_by_lang = defaultdict(list)
        super().__init__()

    def save_files(self):
        """
        When processing data from live Twitter, save it to log files so that
        it can be replayed later.

        Each language gets its own file, `tweets.<lang>.txt`, inside
        `OUTPUT_DIR`. Files are opened in append mode, so earlier batches
        are kept. Newlines inside a tweet are replaced with spaces so that
        every tweet occupies exactly one line. After writing, the in-memory
        buffer is reset.
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        for lang, lines in self.lines_by_lang.items():
            filename = 'tweets.{}.txt'.format(lang)
            fullname = os.path.join(self.OUTPUT_DIR, filename)
            # The context manager closes the file even if a write fails
            # partway through the batch.
            with open(fullname, 'a', encoding='utf-8') as langfile:
                for line in lines:
                    print(line.replace('\n', ' '), file=langfile)
        # Start a fresh buffer so the same tweets are not written twice.
        self.lines_by_lang = defaultdict(list)

    def run_sample(self):
        """
        Listen to live data from Twitter, and pass on the fully-formed tweets
        to `check_ftfy`. This requires the `twitter` Python package as a
        dependency.

        Tweets that carry a 'user' record are also buffered by the user's
        language and periodically flushed to disk via `save_files`.
        """
        # Imported here rather than at module level so that the module can be
        # imported without the optional `twitter` package installed.
        from twitter import TwitterStream
        from ftfy.streamtester.oauth import get_auth
        twitter_stream = TwitterStream(auth=get_auth())
        iterator = twitter_stream.statuses.sample()
        for tweet in iterator:
            # Stream items without a 'text' key are skipped entirely.
            if 'text' in tweet:
                self.check_ftfy(tweet['text'])
                if 'user' in tweet:
                    lang = tweet['user'].get('lang', 'NONE')
                    self.lines_by_lang[lang].append(tweet['text'])
                # NOTE(review): `self.count` is not set in this class; it is
                # presumably maintained by StreamTester.check_ftfy -- confirm.
                # Flushes when the count is 100 past a multiple of 10000.
                if self.count % 10000 == 100:
                    self.save_files()
|
||
|
|
||
|
|
||
|
def main():
    """
    Command-line entry point: build a TwitterTester and let it consume the
    live Twitter sample stream, which runs forever or until the stream
    drops.
    """
    TwitterTester().run_sample()
|
||
|
|
||
|
|
||
|
# Start the tester only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|