Update cfscrape to latest version

https://github.com/Anorov/cloudflare-scrape
This commit is contained in:
Undeadhunter 2016-12-28 10:47:09 +01:00 committed by evilhero
parent cbcb12516e
commit 6101943540
1 changed file with 21 additions and 13 deletions

View File

@ -2,10 +2,9 @@ from time import sleep
import logging
import random
import re
import os
from requests.sessions import Session
import js2py
from js2py import eval_js
from copy import deepcopy
try:
from urlparse import urlparse
@ -35,13 +34,17 @@ class CloudflareScraper(Session):
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
# Check if Cloudflare anti-bot is on
if resp.status_code == 503 and resp.headers.get("Server") == "cloudflare-nginx":
if ( resp.status_code == 503
and resp.headers.get("Server") == "cloudflare-nginx"
and b"jschl_vc" in resp.content
and b"jschl_answer" in resp.content
):
return self.solve_cf_challenge(resp, **kwargs)
# Otherwise, no Cloudflare anti-bot detected
return resp
def solve_cf_challenge(self, resp, **kwargs):
def solve_cf_challenge(self, resp, **original_kwargs):
sleep(5) # Cloudflare requires a delay before solving the challenge
body = resp.text
@ -49,8 +52,9 @@ class CloudflareScraper(Session):
domain = urlparse(resp.url).netloc
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
params = kwargs.setdefault("params", {})
headers = kwargs.setdefault("headers", {})
cloudflare_kwargs = deepcopy(original_kwargs)
params = cloudflare_kwargs.setdefault("params", {})
headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url
try:
@ -73,10 +77,15 @@ class CloudflareScraper(Session):
raise
# Safely evaluate the Javascript expression
js = js.replace('return', '')
params["jschl_answer"] = str(int(eval_js(js)) + len(domain))
params["jschl_answer"] = str(int(js2py.eval_js(js)) + len(domain))
return self.get(submit_url, **kwargs)
# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False
redirect = self.request(method, submit_url, **cloudflare_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
def extract_js(self, body):
js = re.search(r"setTimeout\(function\(\){\s+(var "
@ -88,14 +97,13 @@ class CloudflareScraper(Session):
# These characters are not currently used in Cloudflare's arithmetic snippet
js = re.sub(r"[\n\\']", "", js)
return js.replace("parseInt", "return parseInt")
return js
@classmethod
def create_scraper(cls, sess=None, **kwargs):
"""
Convenience function for creating a ready-to-go requests.Session (subclass) object.
"""
scraper = cls()
if sess:
@ -131,7 +139,7 @@ class CloudflareScraper(Session):
cookie_domain = d
break
else:
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM mode enabled?")
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
return ({
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
@ -150,4 +158,4 @@ class CloudflareScraper(Session):
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string
get_cookie_string = CloudflareScraper.get_cookie_string