summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/common.py
diff options
context:
space:
mode:
author: [Libravatar] Unit 193 <unit193@ubuntu.com>, 2019-07-20 05:51:44 -0400
committer: [Libravatar] Unit 193 <unit193@ubuntu.com>, 2019-07-20 05:51:44 -0400
commit: 2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (patch)
tree: 3d5f633ff69cd393036a3dabc4d4533c8484f9ad /gallery_dl/extractor/common.py
parent: 195c45911e79c33cf0bb986721365fb06df5a153 (diff)
New upstream version 1.9.0 (upstream/1.9.0)
Diffstat (limited to 'gallery_dl/extractor/common.py')
-rw-r--r-- gallery_dl/extractor/common.py 117
1 files changed, 76 insertions, 41 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 175af63..5c40e2a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -13,6 +13,7 @@ import time
import netrc
import queue
import logging
+import datetime
import requests
import threading
import http.cookiejar
@@ -39,10 +40,13 @@ class Extractor():
self._init_headers()
self._init_cookies()
self._init_proxies()
- self._retries = self.config("retries", 5)
+ self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
+ if self._retries < 0:
+ self._retries = float("inf")
+
@classmethod
def from_url(cls, url):
if isinstance(cls.pattern, str):
@@ -63,11 +67,11 @@ class Extractor():
return config.interpolate(
("extractor", self.category, self.subcategory, key), default)
- def request(self, url, method="GET", *, session=None,
- encoding=None, expect=(), retries=None, **kwargs):
- tries = 0
- retries = retries or self._retries
- session = session or self.session
+ def request(self, url, method="GET", *, session=None, retries=None,
+ encoding=None, fatal=True, notfound=None, **kwargs):
+ tries = 1
+ retries = self._retries if retries is None else retries
+ session = self.session if session is None else session
kwargs.setdefault("timeout", self._timeout)
kwargs.setdefault("verify", self._verify)
@@ -83,26 +87,37 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
- if 200 <= code < 400 or code in expect:
+ if 200 <= code < 400 or not fatal and \
+ (400 <= code < 429 or 431 <= code < 500):
if encoding:
response.encoding = encoding
return response
+ if notfound and code == 404:
+ raise exception.NotFoundError(notfound)
if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge")
url, domain, cookies = cloudflare.solve_challenge(
session, response, kwargs)
cloudflare.cookies.update(self.category, (domain, cookies))
continue
+ if cloudflare.is_captcha(response):
+ try:
+ import OpenSSL # noqa
+ except ImportError:
+ msg = " - Install 'pyOpenSSL' and try again"
+ else:
+ msg = ""
+ self.log.warning("Cloudflare CAPTCHA" + msg)
msg = "{}: {} for url: {}".format(code, response.reason, url)
- if code < 500 and code != 429:
+ if code < 500 and code != 429 and code != 430:
break
- tries += 1
- self.log.debug("%s (%d/%d)", msg, tries, retries)
- if tries >= retries:
+ self.log.debug("%s (%s/%s)", msg, tries, retries+1)
+ if tries > retries:
break
- time.sleep(2 ** tries)
+ time.sleep(min(2 ** (tries-1), 1800))
+ tries += 1
raise exception.HttpError(msg)
@@ -130,8 +145,8 @@ class Extractor():
headers.clear()
headers["User-Agent"] = self.config(
- "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
- "Gecko/20100101 Firefox/62.0"))
+ "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
+ "Gecko/20100101 Firefox/68.0"))
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
@@ -203,6 +218,20 @@ class Extractor():
return False
return True
+ def _get_date_min_max(self, dmin=None, dmax=None):
+ """Retrieve and parse 'date-min' and 'date-max' config values"""
+ def get(key, default):
+ ts = self.config(key, default)
+ if isinstance(ts, str):
+ try:
+ ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+ except ValueError as exc:
+ self.log.warning("Unable to parse '%s': %s", key, exc)
+ ts = default
+ return ts
+ fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
+ return get("date-min", dmin), get("date-max", dmax)
+
@classmethod
def _get_tests(cls):
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""
@@ -403,30 +432,36 @@ def generate_extractors(extractor_data, symtable, classes):
http.cookiejar.MozillaCookieJar.magic_re = re.compile(
"#( Netscape)? HTTP Cookie File", re.IGNORECASE)
-# Update default cipher list of urllib3
-# to fix issues with Cloudflare and, by extension, Artstation (#227)
-from requests.packages.urllib3.util import ssl_ # noqa
-logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers")
-
-# cipher list taken from urllib3 1.25
-# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py
-# with additions from
-# https://github.com/Anorov/cloudflare-scrape/pull/242
-ssl_.DEFAULT_CIPHERS = (
- "ECDHE+AESGCM:"
- "ECDHE+CHACHA20:"
- "DHE+AESGCM:"
- "DHE+CHACHA20:"
- "ECDH+AESGCM:"
- "DH+AESGCM:"
- "ECDH+AES:"
- "DH+AES:"
- "RSA+AESGCM:"
- "RSA+AES:"
- "!ECDHE+SHA:"
- "!AES128-SHA:"
- "!aNULL:"
- "!eNULL:"
- "!MD5:"
- "!DSS"
-)
+# Replace default cipher list of urllib3 to avoid Cloudflare CAPTCHAs
+ciphers = config.get(("ciphers",), True)
+if ciphers:
+ logging.getLogger("gallery-dl").debug("Updating urllib3 ciphers")
+
+ if ciphers is True:
+ ciphers = (
+ # Firefox's list
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-AES256-SHA:"
+ "ECDHE-ECDSA-AES128-SHA:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "DHE-RSA-AES128-SHA:"
+ "DHE-RSA-AES256-SHA:"
+ "AES128-SHA:"
+ "AES256-SHA:"
+ "DES-CBC3-SHA"
+ )
+ elif isinstance(ciphers, list):
+ ciphers = ":".join(ciphers)
+
+ from requests.packages.urllib3.util import ssl_ # noqa
+ ssl_.DEFAULT_CIPHERS = ciphers
+ del ssl_