respect robots.txt

chris 2022-12-12 02:46:11 +01:00
parent 3b9d6d8582
commit 17d597168a


@@ -1,7 +1,9 @@
 """Scrape mastodon tag searches and feed into your search."""
-import yaml
 import sys
+import urllib.robotparser
+import urllib.parse
+import yaml
 import requests
@@ -10,9 +12,11 @@ def fetch_toots(conf):
     for instance in conf.get("instances", []):
         for tag in conf.get("tags", []):
             try:
-                curr_posts = requests.get(
-                    f"https://{instance}/tags/{tag}.json"
-                ).json()
+                uri = f"https://{instance}/tags/{tag}.json"
+                if _check_path_allowed(uri):
+                    curr_posts = requests.get(
+                        uri
+                    ).json()
             except Exception as e:
                 print("Got some error fetching toots, continuing...")
                 print(e)
@@ -34,6 +38,14 @@ def search_for(toots):
         print(e)
+
+
+def _check_path_allowed(uri):
+    rp = urllib.robotparser.RobotFileParser()
+    scheme, netloc, _, _, _, _ = urllib.parse.urlparse(uri)
+    rp.set_url(f"{scheme}://{netloc}/robots.txt")
+    rp.read()
+    return rp.can_fetch('*', uri)
 
 
 def __check_lst_of_str(a):
     return bool(a) and isinstance(a, list) and all(isinstance(elem, str) for elem in a)
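
For context, the new _check_path_allowed helper uses only the standard library: urllib.parse.urlparse splits the toot URI into its components, and urllib.robotparser.RobotFileParser downloads and evaluates the host's robots.txt. A minimal standalone sketch of the same pattern (the hostname mastodon.example and the user-agent default are illustrative placeholders, not part of the commit):

import urllib.parse
import urllib.robotparser


def check_path_allowed(uri, user_agent="*"):
    """Fetch the host's robots.txt and report whether uri may be crawled."""
    parts = urllib.parse.urlparse(uri)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()  # fetches and parses robots.txt; raises URLError on network failure
    return rp.can_fetch(user_agent, uri)


# Hypothetical usage; mastodon.example stands in for a real instance.
print(check_path_allowed("https://mastodon.example/tags/python.json"))

Note that rp.read() can raise on network failure; in fetch_toots the helper is called inside the existing try block, so such errors fall through to the same except handler as the toot fetch itself.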