diff --git a/tagtotoot.py b/tagtotoot.py index 9a834e3..894bb40 100644 --- a/tagtotoot.py +++ b/tagtotoot.py @@ -1,7 +1,9 @@ """Scrape mastodon tag searches and feed into your search.""" -import yaml import sys +import urllib.robotparser +import urllib.parse +import yaml import requests @@ -10,9 +12,11 @@ def fetch_toots(conf): for instance in conf.get("instances", []): for tag in conf.get("tags", []): try: - curr_posts = requests.get( - f"https://{instance}/tags/{tag}.json" - ).json() + uri = f"https://{instance}/tags/{tag}.json" + if _check_path_allowed(uri): + curr_posts = requests.get( + uri + ).json() except Exception as e: print("Got some error fetching toots, continuing...") print(e) @@ -34,6 +38,14 @@ def search_for(toots): print(e) +def _check_path_allowed(uri): + rp = urllib.robotparser.RobotFileParser() + scheme, netloc, _, _, _, _ = urllib.parse.urlparse(uri) + rp.set_url(f"{scheme}://{netloc}/robots.txt") + rp.read() + return rp.can_fetch('*', uri) + + def __check_lst_of_str(a): return bool(a) and isinstance(a, list) and all(isinstance(elem, str) for elem in a)