# -*- coding: utf-8 -*-
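"""
cfscrape: a requests.Session subclass that detects Cloudflare's JavaScript
"I'm Under Attack Mode" (IUAM) challenge pages, evaluates the challenge in a
sandboxed Node.js process, and replays the answer so that follow-up requests
carry the cf_clearance cookie.
"""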
import logging
import random
import re
import ssl
import subprocess
import copy
import time
import os
from base64 import b64encode
from collections import OrderedDict
from requests.sessions import Session
from requests.adapters import HTTPAdapter
from requests.compat import urlparse, urlunparse
from requests.exceptions import RequestException
from urllib3.util.ssl_ import create_urllib3_context, DEFAULT_CIPHERS
from .user_agents import USER_AGENTS
__version__ = "2.1.1"
DEFAULT_USER_AGENT = random.choice(USER_AGENTS)
DEFAULT_HEADERS = OrderedDict(
(
("Host", None),
("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"),
("User-Agent", DEFAULT_USER_AGENT),
(
"Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
),
("Accept-Language", "en-US,en;q=0.9"),
("Accept-Encoding", "gzip, deflate"),
)
)
BUG_REPORT = """\
Cloudflare may have changed their technique, or there may be a bug in the script.
Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \
bug report at https://github.com/Anorov/cloudflare-scrape/issues.\
"""
ANSWER_ACCEPT_ERROR = """\
The challenge answer was not properly accepted by Cloudflare. This can occur if \
the target website is under heavy load, or if Cloudflare is experiencing issues. You can
potentially resolve this by increasing the challenge answer delay (default: 8 seconds). \
For example: cfscrape.create_scraper(delay=15)
If increasing the delay does not help, please open a GitHub issue at \
https://github.com/Anorov/cloudflare-scrape/issues\
"""
# Remove a few problematic TLSv1.0 ciphers from the defaults
DEFAULT_CIPHERS += ":!ECDHE+SHA:!AES128-SHA:!AESCCM:!DHE:!ARIA"
class CloudflareAdapter(HTTPAdapter):
""" HTTPS adapter that creates a SSL context with custom ciphers """
def get_connection(self, *args, **kwargs):
conn = super(CloudflareAdapter, self).get_connection(*args, **kwargs)
if conn.conn_kw.get("ssl_context"):
conn.conn_kw["ssl_context"].set_ciphers(DEFAULT_CIPHERS)
else:
context = create_urllib3_context(ciphers=DEFAULT_CIPHERS)
conn.conn_kw["ssl_context"] = context
return conn
class CloudflareError(RequestException):
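    """Base exception for Cloudflare-related request failures."""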
pass
class CloudflareCaptchaError(CloudflareError):
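    """Raised when Cloudflare presents a captcha, which cfscrape cannot solve."""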
pass
class CloudflareScraper(Session):
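    """A requests.Session that transparently solves Cloudflare IUAM challenges."""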
def __init__(self, *args, **kwargs):
self.delay = kwargs.pop("delay", None)
# Use headers with a random User-Agent if no custom headers have been set
headers = OrderedDict(kwargs.pop("headers", DEFAULT_HEADERS))
# Set the User-Agent header if it was not provided
headers.setdefault("User-Agent", DEFAULT_USER_AGENT)
super(CloudflareScraper, self).__init__(*args, **kwargs)
        # Assign the headers as an OrderedDict so header order is preserved on the wire
self.headers = headers
self.org_method = None
self.mount("https://", CloudflareAdapter())
@staticmethod
def is_cloudflare_iuam_challenge(resp):
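        """Return True if the response looks like a Cloudflare IUAM (JavaScript) challenge page."""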
return (
resp.status_code in (503, 429)
and resp.headers.get("Server", "").startswith("cloudflare")
and b"jschl_vc" in resp.content
and b"jschl_answer" in resp.content
)
@staticmethod
def is_cloudflare_captcha_challenge(resp):
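        """Return True if the response is a Cloudflare captcha challenge page."""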
return (
resp.status_code == 403
and resp.headers.get("Server", "").startswith("cloudflare")
and b"/cdn-cgi/l/chk_captcha" in resp.content
)
def request(self, method, url, *args, **kwargs):
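        """Issue the request and transparently solve any Cloudflare challenge in the response."""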
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
# Check if Cloudflare captcha challenge is presented
if self.is_cloudflare_captcha_challenge(resp):
self.handle_captcha_challenge(resp, url)
# Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled
if self.is_cloudflare_iuam_challenge(resp):
resp = self.solve_cf_challenge(resp, **kwargs)
return resp
def cloudflare_is_bypassed(self, url, resp=None):
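        """Return a truthy value if a cf_clearance cookie is already held for this URL's domain."""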
cookie_domain = ".{}".format(urlparse(url).netloc)
return (
self.cookies.get("cf_clearance", None, domain=cookie_domain) or
(resp and resp.cookies.get("cf_clearance", None, domain=cookie_domain))
)
def handle_captcha_challenge(self, resp, url):
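        """Raise CloudflareCaptchaError; captcha challenges cannot be solved by cfscrape."""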
error = (
"Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)"
% urlparse(url).netloc
)
if ssl.OPENSSL_VERSION_NUMBER < 0x10101000:
error += ". Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python."
raise CloudflareCaptchaError(error, response=resp)
def solve_cf_challenge(self, resp, **original_kwargs):
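        """Parse the IUAM challenge form, wait out the required delay, submit the answer, and follow the redirect manually."""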
start_time = time.time()
body = resp.text
parsed_url = urlparse(resp.url)
domain = parsed_url.netloc
        challenge_form = re.search(r'\<form.*?id=\"challenge-form\".*?\/form\>', body, flags=re.S).group(0)  # find the challenge form
method = re.search(r'method=\"(.*?)\"', challenge_form, flags=re.S).group(1)
if self.org_method is None:
self.org_method = resp.request.method
submit_url = "%s://%s%s" % (parsed_url.scheme,
domain,
re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[0])
cloudflare_kwargs = copy.deepcopy(original_kwargs)
headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url
try:
cloudflare_kwargs["params"] = dict()
cloudflare_kwargs["data"] = dict()
            # Carry over any query-string parameters from the form's action URL
            action = re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1)
            if len(action.split('?')) != 1:
                for param in action.split('?')[1].split('&'):
                    cloudflare_kwargs["params"].update({param.split('=')[0]: param.split('=')[1]})
            # Copy every hidden <input> (except jschl_answer, which is computed below)
            # into the follow-up request's data (POST) or params (GET)
            for input_ in re.findall(r'\<input.*?(?:\/>|\<\/input\>)', challenge_form, flags=re.S):
                name = re.search(r'name=\"(.*?)\"', input_, flags=re.S).group(1)
                if name != 'jschl_answer':
                    value = re.search(r'value=\"(.*?)\"', input_, flags=re.S).group(1)
                    if method == 'POST':
                        cloudflare_kwargs["data"].update({name: value})
                    elif method == 'GET':
                        cloudflare_kwargs["params"].update({name: value})
if method == 'POST':
for k in ("jschl_vc", "pass"):
if k not in cloudflare_kwargs["data"]:
raise ValueError("%s is missing from challenge form" % k)
elif method == 'GET':
for k in ("jschl_vc", "pass"):
if k not in cloudflare_kwargs["params"]:
raise ValueError("%s is missing from challenge form" % k)
except Exception as e:
# Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot
# technique. If you see this and are running the latest version,
# please open a GitHub issue so I can update the code accordingly.
raise ValueError(
"Unable to parse Cloudflare anti-bot IUAM page: %s %s"
% (e, BUG_REPORT)
)
# Solve the Javascript challenge
answer, delay = self.solve_challenge(body, domain)
if method == 'POST':
cloudflare_kwargs["data"]["jschl_answer"] = answer
elif method == 'GET':
cloudflare_kwargs["params"]["jschl_answer"] = answer
        # Requests turns any redirected request into a GET, so the redirect has to
        # be handled manually here so that non-GET methods still work even for the
        # initial request.
cloudflare_kwargs["allow_redirects"] = False
# Cloudflare requires a delay before solving the challenge
time.sleep(max(delay - (time.time() - start_time), 0))
# Send the challenge response and handle the redirect manually
redirect = self.request(method, submit_url, **cloudflare_kwargs)
if "Location" in redirect.headers:
redirect_location = urlparse(redirect.headers["Location"])
if not redirect_location.netloc:
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment,
)
)
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
elif "Set-Cookie" in redirect.headers:
if 'cf_clearance' in redirect.headers['Set-Cookie']:
                resp = self.request(self.org_method, submit_url, cookies=redirect.cookies)
return resp
else:
return self.request(method, submit_url, **original_kwargs)
else:
resp = self.request(self.org_method, submit_url, **cloudflare_kwargs)
return resp
def solve_challenge(self, body, domain):
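        """Extract the challenge JavaScript from the page body, evaluate it in a sandboxed Node.js vm, and return (answer, delay)."""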
try:
            all_scripts = re.findall(r'\<script type\=\"text\/javascript\"\>\n(.*?)\<\/script\>', body, flags=re.S)
            # Find the <script> block that contains the obfuscated challenge code
            javascript = next(filter(lambda w: "jschl-answer" in w, all_scripts))
challenge, ms = re.search(
r"setTimeout\(function\(\){\s*(var "
r"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value\s*=.+?)\r?\n"
r"(?:[^{<>]*},\s*(\d{4,}))?",
javascript, flags=re.S
).groups()
# The challenge requires `document.getElementById` to get this content.
# Future proofing would require escaping newlines and double quotes
            # The "k" variable in the challenge JS holds the id of a hidden <div>;
            # its innerHTML feeds into the answer, so locate it and pull its contents.
            innerHTML = ''
            for i in javascript.split(';'):
                if i.strip().split('=')[0].strip() == 'k':
                    k = i.strip().split('=')[1].strip(' \'')
                    innerHTML = re.search(r'\<div.*?id\=\"' + k + r'\".*?\>(.*?)\<\/div\>', body).group(1)
# Prefix the challenge with a fake document object.
# Interpolate the domain, div contents, and JS challenge.
# The `a.value` to be returned is tacked onto the end.
challenge = """
var document = {
createElement: function () {
return { firstChild: { href: "http://%s/" } }
},
getElementById: function () {
return {"innerHTML": "%s"};
}
};
%s; a.value
""" % (
domain,
innerHTML,
challenge,
)
            # Base64-encode the challenge so it can be embedded in the Node.js
            # wrapper below without having to escape quotes or newlines.
challenge = b64encode(challenge.encode("utf-8")).decode("ascii")
# Use the provided delay, parsed delay, or default to 8 secs
delay = self.delay or (float(ms) / float(1000) if ms else 8)
except Exception:
raise ValueError(
"Unable to identify Cloudflare IUAM Javascript on website. %s"
% BUG_REPORT
)
# Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library
js = (
"""\
var atob = Object.setPrototypeOf(function (str) {\
try {\
return Buffer.from("" + str, "base64").toString("binary");\
} catch (e) {}\
}, null);\
var challenge = atob("%s");\
var context = Object.setPrototypeOf({ atob: atob }, null);\
var options = {\
filename: "iuam-challenge.js",\
contextOrigin: "cloudflare:iuam-challenge.js",\
contextCodeGeneration: { strings: true, wasm: false },\
timeout: 5000\
};\
process.stdout.write(String(\
require("vm").runInNewContext(challenge, context, options)\
));\
"""
% challenge
)
stderr = ''
try:
node = subprocess.Popen(
["node", "-e", js], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True
)
result, stderr = node.communicate()
if node.returncode != 0:
stderr = "Node.js Exception:\n%s" % (stderr or None)
raise subprocess.CalledProcessError(node.returncode, "node -e ...", stderr)
except OSError as e:
if e.errno == 2:
raise EnvironmentError(
"Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies."
)
raise
except Exception:
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
raise
try:
float(result)
except Exception:
raise ValueError(
"Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT
)
return result, delay
@classmethod
def create_scraper(cls, sess=None, **kwargs):
"""
Convenience function for creating a ready-to-go CloudflareScraper object.
"""
scraper = cls(**kwargs)
if sess:
attrs = [
"auth",
"cert",
"cookies",
"headers",
"hooks",
"params",
"proxies",
"data",
]
for attr in attrs:
val = getattr(sess, attr, None)
if val:
setattr(scraper, attr, val)
return scraper
# Functions for integrating cloudflare-scrape with other applications and scripts
@classmethod
def get_tokens(cls, url, user_agent=None, **kwargs):
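        """
        Convenience function for fetching the Cloudflare session cookies
        (__cfduid / cf_clearance) and the User-Agent that obtained them.
        """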
scraper = cls.create_scraper()
if user_agent:
scraper.headers["User-Agent"] = user_agent
try:
resp = scraper.get(url, **kwargs)
resp.raise_for_status()
except Exception:
logging.error("'%s' returned an error. Could not collect tokens." % url)
raise
domain = urlparse(resp.url).netloc
cookie_domain = None
for d in scraper.cookies.list_domains():
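            # cf_clearance may be set on a parent domain of the response host,
            # so match the cookie domain by substring.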
if d.startswith(".") and d in ("." + domain):
cookie_domain = d
break
else:
raise ValueError(
'Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?'
)
return (
{
"__cfduid": scraper.cookies.get("__cfduid", "", domain=cookie_domain),
"cf_clearance": scraper.cookies.get(
"cf_clearance", "", domain=cookie_domain
),
},
scraper.headers["User-Agent"],
)
@classmethod
def get_cookie_string(cls, url, user_agent=None, **kwargs):
"""
Convenience function for building a Cookie HTTP header value.
"""
tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
return "; ".join("=".join(pair) for pair in tokens.items()), user_agent
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string
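# Minimal usage sketch (illustrative only; "https://somesite.example" is a
# placeholder for a Cloudflare-protected URL and the delay value is arbitrary):
#
#     import cfscrape
#
#     scraper = cfscrape.create_scraper(delay=10)  # a CloudflareScraper / requests.Session
#     html = scraper.get("https://somesite.example").content
#
#     # Or fetch the clearance cookies / Cookie header for use in another client:
#     tokens, user_agent = cfscrape.get_tokens("https://somesite.example")
#     cookie_header, user_agent = cfscrape.get_cookie_string("https://somesite.example")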