FIX(#2266/#2267): Temporary workaround for cfscrape issues - Anorov/cloudflare-scrape#206

This commit is contained in:
davalanche 2019-05-02 08:43:46 -06:00 committed by evilhero
parent 14c933594b
commit 224c354c99
1 changed file with 71 additions and 15 deletions

View File

@ -2,17 +2,21 @@ import logging
import random import random
import re import re
import subprocess import subprocess
from copy import deepcopy import copy
from time import sleep import time
from requests.sessions import Session from requests.sessions import Session
from collections import OrderedDict
try: try:
from urlparse import urlparse from urlparse import urlparse
from urlparse import urlunparse
except ImportError: except ImportError:
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.parse import urlunparse
__version__ = "1.9.5" __version__ = "1.9.7"
DEFAULT_USER_AGENTS = [ DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
@ -24,8 +28,6 @@ DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
] ]
DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
BUG_REPORT = """\ BUG_REPORT = """\
Cloudflare may have changed their technique, or there may be a bug in the script. Cloudflare may have changed their technique, or there may be a bug in the script.
@ -45,12 +47,13 @@ https://github.com/Anorov/cloudflare-scrape/issues\
class CloudflareScraper(Session): class CloudflareScraper(Session):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.delay = kwargs.pop("delay", 8) self.default_delay = 8
self.delay = kwargs.pop("delay", self.default_delay)
super(CloudflareScraper, self).__init__(*args, **kwargs) super(CloudflareScraper, self).__init__(*args, **kwargs)
if "requests" in self.headers["User-Agent"]: if "requests" in self.headers["User-Agent"]:
# Set a random User-Agent if no custom User-Agent has been set # Set a random User-Agent if no custom User-Agent has been set
self.headers["User-Agent"] = DEFAULT_USER_AGENT self.headers["User-Agent"] = random.choice(DEFAULT_USER_AGENTS)
def is_cloudflare_challenge(self, resp): def is_cloudflare_challenge(self, resp):
return ( return (
@ -61,6 +64,19 @@ class CloudflareScraper(Session):
) )
def request(self, method, url, *args, **kwargs): def request(self, method, url, *args, **kwargs):
self.headers = (
OrderedDict(
[
('User-Agent', self.headers['User-Agent']),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
('Accept-Language', 'en-US,en;q=0.5'),
('Accept-Encoding', 'gzip, deflate'),
('Connection', 'close'),
('Upgrade-Insecure-Requests', '1')
]
)
)
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
# Check if Cloudflare anti-bot is on # Check if Cloudflare anti-bot is on
@ -70,22 +86,22 @@ class CloudflareScraper(Session):
return resp return resp
def solve_cf_challenge(self, resp, **original_kwargs): def solve_cf_challenge(self, resp, **original_kwargs):
sleep(self.delay) # Cloudflare requires a delay before solving the challenge start_time = time.time()
body = resp.text body = resp.text
parsed_url = urlparse(resp.url) parsed_url = urlparse(resp.url)
domain = parsed_url.netloc domain = parsed_url.netloc
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
cloudflare_kwargs = deepcopy(original_kwargs) cloudflare_kwargs = copy.deepcopy(original_kwargs)
params = cloudflare_kwargs.setdefault("params", {}) params = cloudflare_kwargs.setdefault("params", {})
headers = cloudflare_kwargs.setdefault("headers", {}) headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url headers["Referer"] = resp.url
try: try:
params["s"] = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body).group('s_value')
params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1) params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1) params["pass"] = re.search(r'name="pass" value="(.+?)"', body).group(1)
except Exception as e: except Exception as e:
# Something is wrong with the page. # Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot # This may indicate Cloudflare has changed their anti-bot
@ -96,16 +112,28 @@ class CloudflareScraper(Session):
# Solve the Javascript challenge # Solve the Javascript challenge
params["jschl_answer"] = self.solve_challenge(body, domain) params["jschl_answer"] = self.solve_challenge(body, domain)
# Check whether the default delay has been overridden. If it has not, use the delay
# required by Cloudflare.
if self.delay == self.default_delay:
try:
self.delay = float(re.search(r"submit\(\);\r?\n\s*},\s*([0-9]+)", body).group(1)) / float(1000)
except:
pass
# Requests transforms any request into a GET after a redirect, # Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for # so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request. # performing other types of requests even as the first request.
method = resp.request.method method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False cloudflare_kwargs["allow_redirects"] = False
redirect = self.request(method, submit_url, **cloudflare_kwargs)
end_time = time.time()
# Cloudflare requires a delay before solving the challenge
time.sleep(self.delay - (end_time - start_time))
redirect = self.request(method, submit_url, **cloudflare_kwargs)
redirect_location = urlparse(redirect.headers["Location"]) redirect_location = urlparse(redirect.headers["Location"])
if not redirect_location.netloc: if not redirect_location.netloc:
redirect_url = "%s://%s%s" % (parsed_url.scheme, domain, redirect_location.path) redirect_url = urlunparse((parsed_url.scheme, domain, redirect_location.path, redirect_location.params, redirect_location.query, redirect_location.fragment))
return self.request(method, redirect_url, **original_kwargs) return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs) return self.request(method, redirect.headers["Location"], **original_kwargs)
@ -116,8 +144,15 @@ class CloudflareScraper(Session):
except Exception: except Exception:
raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT) raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT)
js = re.sub(r"a\.value = (.+ \+ t\.length).+", r"\1", js) js = re.sub(r"a\.value = (.+\.toFixed\(10\);).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain))) # Match code that accesses the DOM and remove it, but without stripping too much.
try:
solution_name = re.search("s,t,o,p,b,r,e,a,k,i,n,g,f,\s*(.+)\s*=", js).groups(1)
match = re.search("(.*};)\n\s*(t\s*=(.+))\n\s*(;%s.*)" % (solution_name), js, re.M | re.I | re.DOTALL).groups()
js = match[0] + match[-1]
except Exception:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
js = js.replace("t.length", str(len(domain)))
# Strip characters that could be used to exit the string context # Strip characters that could be used to exit the string context
# These characters are not currently used in Cloudflare's arithmetic snippet # These characters are not currently used in Cloudflare's arithmetic snippet
@ -126,9 +161,30 @@ class CloudflareScraper(Session):
if "toFixed" not in js: if "toFixed" not in js:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT) raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)
# 2019-03-20: Cloudflare sometimes stores part of the challenge in a div which is later
# added using document.getElementById(x).innerHTML, so it is necessary to simulate that
# method and value.
try:
# Find the id of the div in the javascript code.
k = re.search(r"k\s+=\s+'([^']+)';", body).group(1)
# Find the div with that id and store its content.
val = re.search(r'<div(.*)id="%s"(.*)>(.*)</div>' % (k), body).group(3)
except Exception:
# If not available, either the code has been modified again, or the old
# style challenge is used.
k = ''
val = ''
# Use vm.runInNewContext to safely evaluate code # Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library # The sandboxed code cannot use the Node.js standard library
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js # Add the atob method, which is now used by Cloudflare's code but is not available in all Node versions.
simulate_document_js = 'var document= {getElementById: function(x) { return {innerHTML:"%s"};}}' % (val)
atob_js = 'var atob = function(str) {return Buffer.from(str, "base64").toString("binary");}'
# t is not defined, so we have to define it and set it to the domain name.
js = '%s;%s;var t="%s";%s' % (simulate_document_js,atob_js,domain,js)
buffer_js = "var Buffer = require('buffer').Buffer"
# Pass Buffer into the new context, so it is available for atob.
js = "%s;console.log(require('vm').runInNewContext('%s', {'Buffer':Buffer,'g':String.fromCharCode}, {timeout: 5000}));" % (buffer_js, js)
try: try:
result = subprocess.check_output(["node", "-e", js]).strip() result = subprocess.check_output(["node", "-e", js]).strip()