mirror of
https://github.com/evilhero/mylar
synced 2025-03-11 14:33:13 +00:00
Update cfscrape to latest version
https://github.com/Anorov/cloudflare-scrape
This commit is contained in:
parent
cbcb12516e
commit
6101943540
1 changed file with 21 additions and 13 deletions
|
@ -2,10 +2,9 @@ from time import sleep
|
|||
import logging
|
||||
import random
|
||||
import re
|
||||
import os
|
||||
from requests.sessions import Session
|
||||
import js2py
|
||||
from js2py import eval_js
|
||||
from copy import deepcopy
|
||||
|
||||
try:
|
||||
from urlparse import urlparse
|
||||
|
@ -35,13 +34,17 @@ class CloudflareScraper(Session):
|
|||
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
|
||||
|
||||
# Check if Cloudflare anti-bot is on
|
||||
if resp.status_code == 503 and resp.headers.get("Server") == "cloudflare-nginx":
|
||||
if ( resp.status_code == 503
|
||||
and resp.headers.get("Server") == "cloudflare-nginx"
|
||||
and b"jschl_vc" in resp.content
|
||||
and b"jschl_answer" in resp.content
|
||||
):
|
||||
return self.solve_cf_challenge(resp, **kwargs)
|
||||
|
||||
# Otherwise, no Cloudflare anti-bot detected
|
||||
return resp
|
||||
|
||||
def solve_cf_challenge(self, resp, **kwargs):
|
||||
def solve_cf_challenge(self, resp, **original_kwargs):
|
||||
sleep(5) # Cloudflare requires a delay before solving the challenge
|
||||
|
||||
body = resp.text
|
||||
|
@ -49,8 +52,9 @@ class CloudflareScraper(Session):
|
|||
domain = urlparse(resp.url).netloc
|
||||
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
|
||||
|
||||
params = kwargs.setdefault("params", {})
|
||||
headers = kwargs.setdefault("headers", {})
|
||||
cloudflare_kwargs = deepcopy(original_kwargs)
|
||||
params = cloudflare_kwargs.setdefault("params", {})
|
||||
headers = cloudflare_kwargs.setdefault("headers", {})
|
||||
headers["Referer"] = resp.url
|
||||
|
||||
try:
|
||||
|
@ -73,10 +77,15 @@ class CloudflareScraper(Session):
|
|||
raise
|
||||
|
||||
# Safely evaluate the Javascript expression
|
||||
js = js.replace('return', '')
|
||||
params["jschl_answer"] = str(int(eval_js(js)) + len(domain))
|
||||
params["jschl_answer"] = str(int(js2py.eval_js(js)) + len(domain))
|
||||
|
||||
return self.get(submit_url, **kwargs)
|
||||
# Requests transforms any request into a GET after a redirect,
|
||||
# so the redirect has to be handled manually here to allow for
|
||||
# performing other types of requests even as the first request.
|
||||
method = resp.request.method
|
||||
cloudflare_kwargs["allow_redirects"] = False
|
||||
redirect = self.request(method, submit_url, **cloudflare_kwargs)
|
||||
return self.request(method, redirect.headers["Location"], **original_kwargs)
|
||||
|
||||
def extract_js(self, body):
|
||||
js = re.search(r"setTimeout\(function\(\){\s+(var "
|
||||
|
@ -88,14 +97,13 @@ class CloudflareScraper(Session):
|
|||
# These characters are not currently used in Cloudflare's arithmetic snippet
|
||||
js = re.sub(r"[\n\\']", "", js)
|
||||
|
||||
return js.replace("parseInt", "return parseInt")
|
||||
return js
|
||||
|
||||
@classmethod
|
||||
def create_scraper(cls, sess=None, **kwargs):
|
||||
"""
|
||||
Convenience function for creating a ready-to-go requests.Session (subclass) object.
|
||||
"""
|
||||
|
||||
scraper = cls()
|
||||
|
||||
if sess:
|
||||
|
@ -131,7 +139,7 @@ class CloudflareScraper(Session):
|
|||
cookie_domain = d
|
||||
break
|
||||
else:
|
||||
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM mode enabled?")
|
||||
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
|
||||
|
||||
return ({
|
||||
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
|
||||
|
@ -150,4 +158,4 @@ class CloudflareScraper(Session):
|
|||
|
||||
create_scraper = CloudflareScraper.create_scraper
|
||||
get_tokens = CloudflareScraper.get_tokens
|
||||
get_cookie_string = CloudflareScraper.get_cookie_string
|
||||
get_cookie_string = CloudflareScraper.get_cookie_string
|
Loading…
Add table
Reference in a new issue