respect robots.txt
This commit is contained in:
parent
3b9d6d8582
commit
17d597168a
1 changed file with 16 additions and 4 deletions
20
tagtotoot.py
20
tagtotoot.py
|
@@ -1,7 +1,9 @@
|
|||
"""Scrape mastodon tag searches and feed into your search."""
|
||||
import yaml
|
||||
import sys
|
||||
import urllib.robotparser
|
||||
import urllib.parse
|
||||
|
||||
import yaml
|
||||
import requests
|
||||
|
||||
|
||||
|
@@ -10,9 +12,11 @@ def fetch_toots(conf):
|
|||
for instance in conf.get("instances", []):
|
||||
for tag in conf.get("tags", []):
|
||||
try:
|
||||
curr_posts = requests.get(
|
||||
f"https://{instance}/tags/{tag}.json"
|
||||
).json()
|
||||
uri = f"https://{instance}/tags/{tag}.json"
|
||||
if _check_path_allowed(uri):
|
||||
curr_posts = requests.get(
|
||||
uri
|
||||
).json()
|
||||
except Exception as e:
|
||||
print("Got some error fetching toots, continuing...")
|
||||
print(e)
|
||||
|
@@ -34,6 +38,14 @@ def search_for(toots):
|
|||
print(e)
|
||||
|
||||
|
||||
def _check_path_allowed(uri):
    """Return True if the host's robots.txt permits fetching *uri*.

    Downloads ``robots.txt`` from the scheme+netloc of *uri* and asks
    whether the generic user agent ``'*'`` may fetch the path.

    NOTE: this re-fetches robots.txt on every call, so callers looping
    over many tags on the same instance pay one extra request each time.
    """
    parts = urllib.parse.urlparse(uri)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    try:
        rp.read()
    except OSError:
        # A network failure (URLError is an OSError subclass) would
        # otherwise propagate and abort the caller's fetch of this item;
        # treat an unreadable robots.txt conservatively as "not allowed".
        return False
    return rp.can_fetch("*", uri)
|
||||
|
||||
|
||||
def __check_lst_of_str(a):
    """Validate that *a* is a non-empty list containing only strings."""
    if not isinstance(a, list) or not a:
        return False
    return all(isinstance(item, str) for item in a)
|
||||
|
||||
|
|
Loading…
Reference in a new issue