diff options
Diffstat (limited to 'gallery_dl/cloudflare.py')
| -rw-r--r-- | gallery_dl/cloudflare.py | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py new file mode 100644 index 0000000..b9bf32d --- /dev/null +++ b/gallery_dl/cloudflare.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Methods to access sites behind Cloudflare protection""" + +import re +import time +import operator +import collections +import urllib.parse +from . import text, exception +from .cache import memcache + + +def is_challenge(response): + return (response.status_code == 503 and + response.headers.get("Server", "").startswith("cloudflare") and + b"jschl-answer" in response.content) + + +def is_captcha(response): + return (response.status_code == 403 and + b'name="captcha-bypass"' in response.content) + + +def solve_challenge(session, response, kwargs): + """Solve Cloudflare challenge and get cfclearance cookie""" + parsed = urllib.parse.urlsplit(response.url) + root = parsed.scheme + "://" + parsed.netloc + + cf_kwargs = {} + headers = cf_kwargs["headers"] = collections.OrderedDict() + params = cf_kwargs["params"] = collections.OrderedDict() + + page = response.text + params["s"] = text.extract(page, 'name="s" value="', '"')[0] + params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] + params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] + params["jschl_answer"] = solve_js_challenge(page, parsed.netloc) + headers["Referer"] = response.url + + time.sleep(4) + + url = root + "/cdn-cgi/l/chk_jschl" + cf_kwargs["allow_redirects"] = False + cf_response = session.request("GET", url, **cf_kwargs) + + location = cf_response.headers.get("Location") + if not location: + import logging + log = logging.getLogger("cloudflare") + rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected" + log.error("%s response", rtype) + log.debug("Headers:\n%s", cf_response.headers) + log.debug("Content:\n%s", cf_response.text) + raise exception.StopExtraction() + + if location[0] == "/": + location = root + location + else: + location = re.sub(r"(https?):/(?!/)", r"\1://", location) + + for cookie in cf_response.cookies: + if cookie.name == "cf_clearance": + return location, cookie.domain, { + cookie.name: cookie.value, + "__cfduid" : response.cookies.get("__cfduid", ""), + } + return location, "", {} + + +def solve_js_challenge(page, netloc): + """Evaluate JS challenge in 'page' to get 'jschl_answer' value""" + + # build variable name + # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk + data, pos = text.extract_all(page, ( + ('var' , ',f, ', '='), + ('key' , '"' , '"'), + ('expr', ':' , '}'), + )) + variable = "{}.{}".format(data["var"], data["key"]) + vlength = len(variable) + + # evaluate the initial expression + solution = evaluate_expression(data["expr"], page, netloc) + + # iterator over all remaining expressions + # and combine their values in 'solution' + expressions = text.extract( + page, "'challenge-form');", "f.submit();", pos)[0] + for expr in expressions.split(";")[1:]: + + if expr.startswith(variable): + # select arithmetc function based on operator (+/-/*) + func = OPERATORS[expr[vlength]] + # evaluate the rest of the expression + value = evaluate_expression(expr[vlength+2:], page, netloc) + # combine expression value with our current solution + solution = func(solution, value) + + elif expr.startswith("a.value"): + if "t.length)" in expr: + # add length of hostname + solution += len(netloc) + if ".toFixed(" in expr: + # trim solution to 10 decimal places + # and strip trailing zeros + solution = "{:.10f}".format(solution).rstrip("0") + return solution + + +def evaluate_expression(expr, page, netloc, *, + split_re=re.compile(r"[(+]+([^)]*)\)")): + """Evaluate a single Javascript expression for the challenge""" + + if expr.startswith("function(p)"): + # get HTML element with ID k and evaluate the expression inside + # 'eval(eval("document.getElementById(k).innerHTML"))' + k, pos = text.extract(page, "k = '", "'") + e, pos = text.extract(page, 'id="'+k+'"', '<') + return evaluate_expression(e.partition(">")[2], page, netloc) + + if "/" in expr: + # split the expression in numerator and denominator subexpressions, + # evaluate them separately, + # and return their fraction-result + num, _, denom = expr.partition("/") + num = evaluate_expression(num, page, netloc) + denom = evaluate_expression(denom, page, netloc) + return num / denom + + if "function(p)" in expr: + # split initial expression and function code + initial, _, func = expr.partition("function(p)") + # evaluate said expression + initial = evaluate_expression(initial, page, netloc) + # get function argument and use it as index into 'netloc' + index = evaluate_expression(func[func.index("}")+1:], page, netloc) + return initial + ord(netloc[int(index)]) + + # iterate over all subexpressions, + # evaluate them, + # and accumulate their values in 'result' + result = "" + for subexpr in split_re.findall(expr) or (expr,): + result += str(sum( + VALUES[part] + for part in subexpr.split("[]") + )) + return int(result) + + +OPERATORS = { + "+": operator.add, + "-": operator.sub, + "*": operator.mul, +} + +VALUES = { + "": 0, + "+": 0, + "!+": 1, + "!!": 1, + "+!!": 1, +} + + +@memcache(keyarg=0) +def cookies(category): + return None |
