diff options
| author | 2019-07-20 05:51:44 -0400 | |
|---|---|---|
| committer | 2019-07-20 05:51:44 -0400 | |
| commit | 2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (patch) | |
| tree | 3d5f633ff69cd393036a3dabc4d4533c8484f9ad /gallery_dl/extractor/common.py | |
| parent | 195c45911e79c33cf0bb986721365fb06df5a153 (diff) | |
New upstream version 1.9.0 (upstream/1.9.0)
Diffstat (limited to 'gallery_dl/extractor/common.py')
| -rw-r--r-- | gallery_dl/extractor/common.py | 117 |
1 files changed, 76 insertions, 41 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 175af63..5c40e2a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -13,6 +13,7 @@ import time import netrc import queue import logging +import datetime import requests import threading import http.cookiejar @@ -39,10 +40,13 @@ class Extractor(): self._init_headers() self._init_cookies() self._init_proxies() - self._retries = self.config("retries", 5) + self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) + if self._retries < 0: + self._retries = float("inf") + @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -63,11 +67,11 @@ class Extractor(): return config.interpolate( ("extractor", self.category, self.subcategory, key), default) - def request(self, url, method="GET", *, session=None, - encoding=None, expect=(), retries=None, **kwargs): - tries = 0 - retries = retries or self._retries - session = session or self.session + def request(self, url, method="GET", *, session=None, retries=None, + encoding=None, fatal=True, notfound=None, **kwargs): + tries = 1 + retries = self._retries if retries is None else retries + session = self.session if session is None else session kwargs.setdefault("timeout", self._timeout) kwargs.setdefault("verify", self._verify) @@ -83,26 +87,37 @@ class Extractor(): raise exception.HttpError(exc) else: code = response.status_code - if 200 <= code < 400 or code in expect: + if 200 <= code < 400 or not fatal and \ + (400 <= code < 429 or 431 <= code < 500): if encoding: response.encoding = encoding return response + if notfound and code == 404: + raise exception.NotFoundError(notfound) if cloudflare.is_challenge(response): self.log.info("Solving Cloudflare challenge") url, domain, cookies = cloudflare.solve_challenge( session, response, kwargs) cloudflare.cookies.update(self.category, (domain, cookies)) continue + if 
cloudflare.is_captcha(response): + try: + import OpenSSL # noqa + except ImportError: + msg = " - Install 'pyOpenSSL' and try again" + else: + msg = "" + self.log.warning("Cloudflare CAPTCHA" + msg) msg = "{}: {} for url: {}".format(code, response.reason, url) - if code < 500 and code != 429: + if code < 500 and code != 429 and code != 430: break - tries += 1 - self.log.debug("%s (%d/%d)", msg, tries, retries) - if tries >= retries: + self.log.debug("%s (%s/%s)", msg, tries, retries+1) + if tries > retries: break - time.sleep(2 ** tries) + time.sleep(min(2 ** (tries-1), 1800)) + tries += 1 raise exception.HttpError(msg) @@ -130,8 +145,8 @@ class Extractor(): headers.clear() headers["User-Agent"] = self.config( - "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) " - "Gecko/20100101 Firefox/62.0")) + "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) " + "Gecko/20100101 Firefox/68.0")) headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" headers["Accept-Encoding"] = "gzip, deflate" @@ -203,6 +218,20 @@ class Extractor(): return False return True + def _get_date_min_max(self, dmin=None, dmax=None): + """Retrieve and parse 'date-min' and 'date-max' config values""" + def get(key, default): + ts = self.config(key, default) + if isinstance(ts, str): + try: + ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) + except ValueError as exc: + self.log.warning("Unable to parse '%s': %s", key, exc) + ts = default + return ts + fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") + return get("date-min", dmin), get("date-max", dmax) + @classmethod def _get_tests(cls): """Yield an extractor's test cases as (URL, RESULTS) tuples""" @@ -403,30 +432,36 @@ def generate_extractors(extractor_data, symtable, classes): http.cookiejar.MozillaCookieJar.magic_re = re.compile( "#( Netscape)? 
HTTP Cookie File", re.IGNORECASE) -# Update default cipher list of urllib3 -# to fix issues with Cloudflare and, by extension, Artstation (#227) -from requests.packages.urllib3.util import ssl_ # noqa -logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers") - -# cipher list taken from urllib3 1.25 -# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py -# with additions from -# https://github.com/Anorov/cloudflare-scrape/pull/242 -ssl_.DEFAULT_CIPHERS = ( - "ECDHE+AESGCM:" - "ECDHE+CHACHA20:" - "DHE+AESGCM:" - "DHE+CHACHA20:" - "ECDH+AESGCM:" - "DH+AESGCM:" - "ECDH+AES:" - "DH+AES:" - "RSA+AESGCM:" - "RSA+AES:" - "!ECDHE+SHA:" - "!AES128-SHA:" - "!aNULL:" - "!eNULL:" - "!MD5:" - "!DSS" -) +# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs +ciphers = config.get(("ciphers",), True) +if ciphers: + logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers") + + if ciphers is True: + ciphers = ( + # Firefox's list + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "DHE-RSA-AES128-SHA:" + "DHE-RSA-AES256-SHA:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + ) + elif isinstance(ciphers, list): + ciphers = ":".join(ciphers) + + from requests.packages.urllib3.util import ssl_ # noqa + ssl_.DEFAULT_CIPHERS = ciphers + del ssl_ |
