respect robots.txt

This commit is contained in:
chris 2022-12-12 02:46:11 +01:00
parent 3b9d6d8582
commit 17d597168a
1 changed file with 16 additions and 4 deletions

View File

@ -1,7 +1,9 @@
"""Scrape mastodon tag searches and feed into your search.""" """Scrape mastodon tag searches and feed into your search."""
import yaml
import sys import sys
import urllib.robotparser
import urllib.parse
import yaml
import requests import requests
@ -10,9 +12,11 @@ def fetch_toots(conf):
for instance in conf.get("instances", []): for instance in conf.get("instances", []):
for tag in conf.get("tags", []): for tag in conf.get("tags", []):
try: try:
curr_posts = requests.get( uri = f"https://{instance}/tags/{tag}.json"
f"https://{instance}/tags/{tag}.json" if _check_path_allowed(uri):
).json() curr_posts = requests.get(
uri
).json()
except Exception as e: except Exception as e:
print("Got some error fetching toots, continuing...") print("Got some error fetching toots, continuing...")
print(e) print(e)
@ -34,6 +38,14 @@ def search_for(toots):
print(e) print(e)
# Parsed robots.txt rules, keyed by the robots.txt URL they were read from.
# Lets each host's robots.txt be downloaded once per run instead of once per
# checked URI.
_ROBOTS_PARSERS = {}


def _check_path_allowed(uri):
    """Return whether the host's robots.txt permits fetching ``uri``.

    The robots.txt location is derived from the scheme and network location
    of ``uri``. The rules are evaluated for the wildcard user agent ``'*'``.

    Args:
        uri: Absolute URL that is about to be requested.

    Returns:
        The result of ``RobotFileParser.can_fetch('*', uri)``.

    Any exception raised while downloading robots.txt is not handled here
    and propagates to the caller.
    """
    parts = urllib.parse.urlparse(uri)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    rp = _ROBOTS_PARSERS.get(robots_url)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        # read() performs the network request. The parser is stored only
        # after it returns, so a failed download is retried on the next call
        # rather than leaving an unread parser in the cache.
        rp.read()
        _ROBOTS_PARSERS[robots_url] = rp

    return rp.can_fetch('*', uri)
def __check_lst_of_str(a): def __check_lst_of_str(a):
return bool(a) and isinstance(a, list) and all(isinstance(elem, str) for elem in a) return bool(a) and isinstance(a, list) and all(isinstance(elem, str) for elem in a)