diff options
| author | 2021-03-13 16:26:30 -0500 | |
|---|---|---|
| committer | 2021-03-13 16:26:30 -0500 | |
| commit | 3201d77a148367d739862b4f07868a76eaeb7cb1 (patch) | |
| tree | 78b8d71633ec000672a84ad0bbbddd0513ae2d30 /gallery_dl/extractor/common.py | |
| parent | fc83315c164afd74734adf27e0f7fec2011904aa (diff) | |
New upstream version 1.17.0. (tag: upstream/1.17.0)
Diffstat (limited to 'gallery_dl/extractor/common.py')
| -rw-r--r-- | gallery_dl/extractor/common.py | 293 |
1 file changed, 180 insertions, 113 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 15cc776..e9b9718 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ """Common classes and constants used by extractor modules.""" import re +import ssl import time import netrc import queue @@ -16,8 +17,9 @@ import logging import datetime import requests import threading +from requests.adapters import HTTPAdapter from .message import Message -from .. import config, text, util, exception, cloudflare +from .. import config, text, util, exception class Extractor(): @@ -30,6 +32,7 @@ class Extractor(): filename_fmt = "{filename}.{extension}" archive_fmt = "" cookiedomain = "" + browser = None root = "" test = None request_interval = 0.0 @@ -37,15 +40,15 @@ class Extractor(): request_timestamp = 0.0 def __init__(self, match): - self.session = requests.Session() self.log = logging.getLogger(self.category) self.url = match.string - self._cookiefile = None - self._cookiejar = self.session.cookies + if self.basecategory: + self.config = self._config_shared + self.config_accumulate = self._config_shared_accumulate + self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" - self._cfgpath = ("extractor", self.category, self.subcategory) self._write_pages = self.config("write-pages", False) self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) @@ -58,11 +61,7 @@ class Extractor(): if self.request_interval < self.request_interval_min: self.request_interval = self.request_interval_min - if self.basecategory: - self.config = self._config_shared - self.config_accumulate = self._config_shared_accumulate - - self._init_headers() + 
self._init_session() self._init_cookies() self._init_proxies() @@ -140,21 +139,20 @@ class Extractor(): if notfound and code == 404: raise exception.NotFoundError(notfound) - reason = response.reason - if cloudflare.is_challenge(response): - self.log.info("Solving Cloudflare challenge") - response, domain, cookies = cloudflare.solve_challenge( - session, response, kwargs) - if cookies: - cloudflare.cookies.update( - self.category, (domain, cookies)) - return response - if cloudflare.is_captcha(response): - self.log.warning("Cloudflare CAPTCHA") - - msg = "'{} {}' for '{}'".format(code, reason, url) + msg = "'{} {}' for '{}'".format(code, response.reason, url) + server = response.headers.get("Server") + if server and server.startswith("cloudflare"): + if code == 503 and \ + b"jschl-answer" in response.content: + self.log.warning("Cloudflare IUAM challenge") + break + if code == 403 and \ + b'name="captcha-bypass"' in response.content: + self.log.warning("Cloudflare CAPTCHA") + break if code < 500 and code != 429 and code != 430: break + finally: Extractor.request_timestamp = time.time() @@ -212,19 +210,46 @@ class Extractor(): return username, password - def _init_headers(self): - """Initialize HTTP headers for the 'session' object""" - headers = self.session.headers + def _init_session(self): + self.session = session = requests.Session() + headers = session.headers headers.clear() - headers["User-Agent"] = self.config( - "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) " - "Gecko/20100101 Firefox/68.0")) - headers["Accept"] = "*/*" - headers["Accept-Language"] = "en-US,en;q=0.5" - headers["Accept-Encoding"] = "gzip, deflate" - headers["Connection"] = "keep-alive" - headers["Upgrade-Insecure-Requests"] = "1" + browser = self.config("browser") or self.browser + if browser: + browser, _, platform = browser.lower().partition(":") + + if not platform or platform == "auto": + platform = ("Windows NT 10.0; Win64; x64" + if util.WINDOWS else "X11; Linux x86_64") + 
elif platform == "windows": + platform = "Windows NT 10.0; Win64; x64" + elif platform == "linux": + platform = "X11; Linux x86_64" + elif platform == "macos": + platform = "Macintosh; Intel Mac OS X 11.2" + + if browser == "chrome": + _emulate_browser_chrome(session, platform) + else: + _emulate_browser_firefox(session, platform) + else: + headers["User-Agent"] = self.config("user-agent", ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:78.0) Gecko/20100101 Firefox/78.0")) + headers["Accept"] = "*/*" + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + + custom_headers = self.config("headers") + if custom_headers: + headers.update(custom_headers) + + ciphers = self.config("ciphers") + if ciphers: + if isinstance(ciphers, list): + ciphers = ":".join(ciphers) + session.mount("https://", HTTPSAdapter(ciphers)) def _init_proxies(self): """Update the session's proxy map""" @@ -242,6 +267,8 @@ class Extractor(): def _init_cookies(self): """Populate the session's cookiejar""" + self._cookiefile = None + self._cookiejar = self.session.cookies if self.cookiedomain is None: return @@ -264,11 +291,6 @@ class Extractor(): "expected 'dict' or 'str' value for 'cookies' option, " "got '%s' (%s)", cookies.__class__.__name__, cookies) - cookies = cloudflare.cookies(self.category) - if cookies: - domain, cookies = cookies - self._update_cookies_dict(cookies, domain) - def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" if self._cookiefile and self.config("cookies-update", True): @@ -527,46 +549,126 @@ class AsynchronousMixin(): messages.put(None) -def generate_extractors(extractor_data, symtable, classes): - """Dynamically generate Extractor classes""" - extractors = config.get(("extractor",), classes[0].basecategory) - ckey = extractor_data.get("_ckey") - prev = None - - if extractors: - extractor_data.update(extractors) - - for category, info in extractor_data.items(): - - if not isinstance(info, 
dict) or "root" not in info: - continue - - root = info["root"] - domain = root[root.index(":") + 3:] - pattern = info.get("pattern") or re.escape(domain) - name = (info.get("name") or category).capitalize() - - for cls in classes: - - class Extr(cls): - pass - Extr.__module__ = cls.__module__ - Extr.__name__ = Extr.__qualname__ = \ - name + cls.subcategory.capitalize() + "Extractor" - Extr.__doc__ = \ - "Extractor for " + cls.subcategory + "s from " + domain - Extr.category = category - Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt - Extr.test = info.get("test-" + cls.subcategory) - Extr.root = root +class BaseExtractor(Extractor): + instances = () - if "extra" in info: - for key, value in info["extra"].items(): - setattr(Extr, key, value) - if prev and ckey: - setattr(Extr, ckey, prev) + def __init__(self, match): + if not self.category: + for index, group in enumerate(match.groups()): + if group is not None: + self.category, self.root = self.instances[index] + break + Extractor.__init__(self, match) - symtable[Extr.__name__] = prev = Extr + @classmethod + def update(cls, instances): + extra_instances = config.get(("extractor",), cls.basecategory) + if extra_instances: + for category, info in extra_instances.items(): + if isinstance(info, dict) and "root" in info: + instances[category] = info + + pattern_list = [] + instance_list = cls.instances = [] + for category, info in instances.items(): + root = info["root"].rstrip("/") + instance_list.append((category, root)) + + pattern = info.get("pattern") + if not pattern: + pattern = re.escape(root[root.index(":") + 3:]) + pattern_list.append(pattern + "()") + + return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")" + + +class HTTPSAdapter(HTTPAdapter): + + def __init__(self, ciphers): + context = self.ssl_context = ssl.create_default_context() + context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | + ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) + context.set_ecdh_curve("prime256v1") + 
context.set_ciphers(ciphers) + HTTPAdapter.__init__(self) + + def init_poolmanager(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.init_poolmanager(self, *args, **kwargs) + + def proxy_manager_for(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.proxy_manager_for(self, *args, **kwargs) + + +def _emulate_browser_firefox(session, platform): + headers = session.headers + headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) " + "Gecko/20100101 Firefox/78.0") + headers["Accept"] = ("text/html,application/xhtml+xml," + "application/xml;q=0.9,image/webp,*/*;q=0.8") + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + headers["Referer"] = None + headers["Upgrade-Insecure-Requests"] = "1" + headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "DHE-RSA-AES128-SHA:" + "DHE-RSA-AES256-SHA:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) + + +def _emulate_browser_chrome(session, platform): + if platform.startswith("Macintosh"): + platform = platform.replace(".", "_") + "_0" + + headers = session.headers + headers["Upgrade-Insecure-Requests"] = "1" + headers["User-Agent"] = ( + "Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36") + headers["Accept"] = ("text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/webp,image/apng,*/*;q=0.8") + headers["Referer"] = None + headers["Accept-Encoding"] = "gzip, deflate" + headers["Accept-Language"] = "en-US,en;q=0.9" + 
headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "TLS_CHACHA20_POLY1305_SHA256:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) # Undo automatic pyOpenSSL injection by requests @@ -578,38 +680,3 @@ if not pyopenssl: except ImportError: pass del pyopenssl - - -# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs -ciphers = config.get((), "ciphers", True) -if ciphers: - - if ciphers is True: - ciphers = ( - # Firefox's list - "TLS_AES_128_GCM_SHA256:" - "TLS_CHACHA20_POLY1305_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-AES256-SHA:" - "ECDHE-ECDSA-AES128-SHA:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "DHE-RSA-AES128-SHA:" - "DHE-RSA-AES256-SHA:" - "AES128-SHA:" - "AES256-SHA:" - "DES-CBC3-SHA" - ) - elif isinstance(ciphers, list): - ciphers = ":".join(ciphers) - - from requests.packages.urllib3.util import ssl_ # noqa - ssl_.DEFAULT_CIPHERS = ciphers - del ssl_ -del ciphers |
