Update cfscrape to latest version

https://github.com/Anorov/cloudflare-scrape
This commit is contained in:
Undeadhunter 2016-12-28 10:47:09 +01:00 committed by evilhero
parent cbcb12516e
commit 6101943540
1 changed file with 21 additions and 13 deletions

View File

@@ -2,10 +2,9 @@ from time import sleep
import logging import logging
import random import random
import re import re
import os
from requests.sessions import Session from requests.sessions import Session
import js2py import js2py
from js2py import eval_js from copy import deepcopy
try: try:
from urlparse import urlparse from urlparse import urlparse
@@ -35,13 +34,17 @@ class CloudflareScraper(Session):
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
# Check if Cloudflare anti-bot is on # Check if Cloudflare anti-bot is on
if resp.status_code == 503 and resp.headers.get("Server") == "cloudflare-nginx": if ( resp.status_code == 503
and resp.headers.get("Server") == "cloudflare-nginx"
and b"jschl_vc" in resp.content
and b"jschl_answer" in resp.content
):
return self.solve_cf_challenge(resp, **kwargs) return self.solve_cf_challenge(resp, **kwargs)
# Otherwise, no Cloudflare anti-bot detected # Otherwise, no Cloudflare anti-bot detected
return resp return resp
def solve_cf_challenge(self, resp, **kwargs): def solve_cf_challenge(self, resp, **original_kwargs):
sleep(5) # Cloudflare requires a delay before solving the challenge sleep(5) # Cloudflare requires a delay before solving the challenge
body = resp.text body = resp.text
@@ -49,8 +52,9 @@ class CloudflareScraper(Session):
domain = urlparse(resp.url).netloc domain = urlparse(resp.url).netloc
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
params = kwargs.setdefault("params", {}) cloudflare_kwargs = deepcopy(original_kwargs)
headers = kwargs.setdefault("headers", {}) params = cloudflare_kwargs.setdefault("params", {})
headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url headers["Referer"] = resp.url
try: try:
@@ -73,10 +77,15 @@ class CloudflareScraper(Session):
raise raise
# Safely evaluate the Javascript expression # Safely evaluate the Javascript expression
js = js.replace('return', '') params["jschl_answer"] = str(int(js2py.eval_js(js)) + len(domain))
params["jschl_answer"] = str(int(eval_js(js)) + len(domain))
return self.get(submit_url, **kwargs) # Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False
redirect = self.request(method, submit_url, **cloudflare_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
def extract_js(self, body): def extract_js(self, body):
js = re.search(r"setTimeout\(function\(\){\s+(var " js = re.search(r"setTimeout\(function\(\){\s+(var "
@@ -88,14 +97,13 @@ class CloudflareScraper(Session):
# These characters are not currently used in Cloudflare's arithmetic snippet # These characters are not currently used in Cloudflare's arithmetic snippet
js = re.sub(r"[\n\\']", "", js) js = re.sub(r"[\n\\']", "", js)
return js.replace("parseInt", "return parseInt") return js
@classmethod @classmethod
def create_scraper(cls, sess=None, **kwargs): def create_scraper(cls, sess=None, **kwargs):
""" """
Convenience function for creating a ready-to-go requests.Session (subclass) object. Convenience function for creating a ready-to-go requests.Session (subclass) object.
""" """
scraper = cls() scraper = cls()
if sess: if sess:
@@ -131,7 +139,7 @@ class CloudflareScraper(Session):
cookie_domain = d cookie_domain = d
break break
else: else:
raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM mode enabled?") raise ValueError("Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM (\"I'm Under Attack Mode\") enabled?")
return ({ return ({
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain), "__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
@@ -150,4 +158,4 @@ class CloudflareScraper(Session):
create_scraper = CloudflareScraper.create_scraper create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string get_cookie_string = CloudflareScraper.get_cookie_string