respect robots.txt
This commit is contained in:
parent
3b9d6d8582
commit
17d597168a
20
tagtotoot.py
20
tagtotoot.py
|
@ -1,7 +1,9 @@
|
||||||
"""Scrape mastodon tag searches and feed into your search."""
|
"""Scrape mastodon tag searches and feed into your search."""
|
||||||
import yaml
|
|
||||||
import sys
|
import sys
|
||||||
|
import urllib.robotparser
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
import yaml
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,9 +12,11 @@ def fetch_toots(conf):
|
||||||
for instance in conf.get("instances", []):
|
for instance in conf.get("instances", []):
|
||||||
for tag in conf.get("tags", []):
|
for tag in conf.get("tags", []):
|
||||||
try:
|
try:
|
||||||
curr_posts = requests.get(
|
uri = f"https://{instance}/tags/{tag}.json"
|
||||||
f"https://{instance}/tags/{tag}.json"
|
if _check_path_allowed(uri):
|
||||||
).json()
|
curr_posts = requests.get(
|
||||||
|
uri
|
||||||
|
).json()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Got some error fetching toots, continuing...")
|
print("Got some error fetching toots, continuing...")
|
||||||
print(e)
|
print(e)
|
||||||
|
@ -34,6 +38,14 @@ def search_for(toots):
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
|
# Parsed robots.txt files keyed by their URL, so that each host's robots.txt
# is downloaded at most once per run instead of once per checked URI.
_ROBOTS_CACHE = {}


def _check_path_allowed(uri):
    """Return whether the host's robots.txt permits fetching *uri*.

    The robots.txt location is built from the scheme and network location
    of *uri*. The parsed file is cached per robots.txt URL, so checking
    many URLs on the same host does not re-download it every time.

    Args:
        uri: Absolute URL that is about to be requested.

    Returns:
        True if the rules that apply to the wildcard user agent ("*")
        allow *uri*, False otherwise.

    Raises:
        Any exception raised by ``RobotFileParser.read`` when robots.txt
        cannot be retrieved (for example a connection failure). Failed
        reads are not cached, so a later call retries the download.
    """
    parts = urllib.parse.urlparse(uri)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    rp = _ROBOTS_CACHE.get(robots_url)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        # Only remember the parser once read() has completed successfully.
        _ROBOTS_CACHE[robots_url] = rp

    # NOTE(review): rules are evaluated for the wildcard agent only; confirm
    # this matches the User-Agent the actual HTTP request is sent with.
    return rp.can_fetch("*", uri)
|
||||||
|
|
||||||
|
|
||||||
def __check_lst_of_str(a):
    """Return True only if *a* is a non-empty list whose items are all str.

    The type check runs before the emptiness check so that arbitrary
    objects (including ones whose truth value cannot be evaluated) are
    rejected without ever being truth-tested.

    Args:
        a: Any value to validate.

    Returns:
        True if *a* is a list, is not empty, and every element is a str;
        False in every other case.
    """
    if not isinstance(a, list):
        return False
    return bool(a) and all(isinstance(elem, str) for elem in a)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue