mirror of
synced 2025-03-11 14:33:13 +00:00
FIX(#2266/#2267): Temporary workaround for cfscrape issues - Anorov/cloudflare-scrape#206
This commit is contained in:
1 changed files with 71 additions and 15 deletions
@ -2,17 +2,21 @@ import logging
import random
import re
import subprocess
from copy import deepcopy
from time import sleep
import copy
import time
from requests.sessions import Session
from collections import OrderedDict
from urlparse import urlparse
from urlparse import urlunparse
except ImportError:
from urllib.parse import urlparse
from urllib.parse import urlunparse
__version__ = "1.9.5"
__version__ = "1.9.7"
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
@ -24,8 +28,6 @@ DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
Cloudflare may have changed their technique, or there may be a bug in the script.
@ -45,12 +47,13 @@ https://github.com/Anorov/cloudflare-scrape/issues\
class CloudflareScraper(Session):
def __init__(self, *args, **kwargs):
self.delay = kwargs.pop("delay", 8)
self.default_delay = 8
self.delay = kwargs.pop("delay", self.default_delay)
super(CloudflareScraper, self).__init__(*args, **kwargs)
if "requests" in self.headers["User-Agent"]:
# Set a random User-Agent if no custom User-Agent has been set
self.headers["User-Agent"] = DEFAULT_USER_AGENT
self.headers["User-Agent"] = random.choice(DEFAULT_USER_AGENTS)
def is_cloudflare_challenge(self, resp):
return (
@ -61,6 +64,19 @@ class CloudflareScraper(Session):
def request(self, method, url, *args, **kwargs):
self.headers = (
('User-Agent', self.headers['User-Agent']),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Language', 'en-US,en;q=0.5'),
('Accept-Encoding', 'gzip, deflate'),
('Connection', 'close'),
('Upgrade-Insecure-Requests', '1')
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
# Check if Cloudflare anti-bot is on
@ -70,22 +86,22 @@ class CloudflareScraper(Session):
return resp
def solve_cf_challenge(self, resp, **original_kwargs):
sleep(self.delay) # Cloudflare requires a delay before solving the challenge
start_time = time.time()
body = resp.text
parsed_url = urlparse(resp.url)
domain = parsed_url.netloc
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
cloudflare_kwargs = deepcopy(original_kwargs)
cloudflare_kwargs = copy.deepcopy(original_kwargs)
params = cloudflare_kwargs.setdefault("params", {})
headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url
params["s"] = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body).group('s_value')
params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
except Exception as e:
# Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot
@ -96,16 +112,28 @@ class CloudflareScraper(Session):
# Solve the Javascript challenge
params["jschl_answer"] = self.solve_challenge(body, domain)
# Check if the default delay has been overridden. If not, use the delay required by
# Cloudflare.
if self.delay == self.default_delay:
self.delay = float(re.search(r"submit\(\);\r?\n\s*},\s*([0-9]+)", body).group(1)) / float(1000)
# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False
redirect = self.request(method, submit_url, **cloudflare_kwargs)
end_time = time.time()
# Cloudflare requires a delay before solving the challenge
time.sleep(self.delay - (end_time - start_time))
redirect = self.request(method, submit_url, **cloudflare_kwargs)
redirect_location = urlparse(redirect.headers["Location"])
if not redirect_location.netloc:
redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path)
redirect_url = urlunparse((parsed_url.scheme, domain, redirect_location.path, redirect_location.params, redirect_location.query, redirect_location.fragment))
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
@ -116,8 +144,15 @@ class CloudflareScraper(Session):
except Exception:
raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT)
js = re.sub(r"a\.value = (.+ \+ t\.length).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))
js = re.sub(r"a\.value = (.+\.toFixed\(10\);).+", r"\1", js)
# Match code that accesses the DOM and remove it, but without stripping too much.
solution_name = re.search("s,t,o,p,b,r,e,a,k,i,n,g,f,\s*(.+)\s*=", js).groups(1)
match = re.search("(.*};)\n\s*(t\s*=(.+))\n\s*(;%s.*)" % (solution_name), js, re.M | re.I | re.DOTALL).groups()
js = match[0] + match[-1]
except Exception:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
js = js.replace("t.length", str(len(domain)))
# Strip characters that could be used to exit the string context
# These characters are not currently used in Cloudflare's arithmetic snippet
@ -126,9 +161,30 @@ class CloudflareScraper(Session):
if "toFixed" not in js:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
# 2019-03-20: Cloudflare sometimes stores part of the challenge in a div which is later
# added using document.getElementById(x).innerHTML, so it is necessary to simulate that
# method and value.
# Find the id of the div in the javascript code.
k = re.search(r"k\s+=\s+'([^']+)';", body).group(1)
# Find the div with that id and store its content.
val = re.search(r'<div(.*)id="%s"(.*)>(.*)</div>' % (k), body).group(3)
except Exception:
# If not available, either the code has been modified again, or the old
# style challenge is used.
k = ''
val = ''
# Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js
# Add the atob method, which is now used by Cloudflare's code but is not available in all Node.js versions.
simulate_document_js = 'var document= {getElementById: function(x) { return {innerHTML:"%s"};}}' % (val)
atob_js = 'var atob = function(str) {return Buffer.from(str, "base64").toString("binary");}'
# t is not defined, so we have to define it and set it to the domain name.
js = '%s;%s;var t="%s";%s' % (simulate_document_js,atob_js,domain,js)
buffer_js = "var Buffer = require('buffer').Buffer"
# Pass Buffer into the new context, so it is available for atob.
js = "%s;console.log(require('vm').runInNewContext('%s', {'Buffer':Buffer,'g':String.fromCharCode}, {timeout: 5000}));" % (buffer_js, js)
result = subprocess.check_output(["node", "-e", js]).strip()
Add table
Reference in a new issue