summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/common.py')
-rw-r--r-- gallery_dl/extractor/common.py 196
1 files changed, 102 insertions, 94 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5c9b157..0d67df7 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -32,45 +32,21 @@ class Extractor():
directory_fmt = ("{category}",)
filename_fmt = "{filename}.{extension}"
archive_fmt = ""
- cookiedomain = ""
- browser = None
root = ""
- test = None
- finalize = None
+ cookies_domain = ""
+ referer = True
+ tls12 = True
+ browser = None
request_interval = 0.0
request_interval_min = 0.0
request_timestamp = 0.0
- tls12 = True
def __init__(self, match):
self.log = logging.getLogger(self.category)
self.url = match.string
-
- if self.basecategory:
- self.config = self._config_shared
- self.config_accumulate = self._config_shared_accumulate
self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""
- self._write_pages = self.config("write-pages", False)
- self._retry_codes = self.config("retry-codes")
- self._retries = self.config("retries", 4)
- self._timeout = self.config("timeout", 30)
- self._verify = self.config("verify", True)
- self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
- self._interval = util.build_duration_func(
- self.config("sleep-request", self.request_interval),
- self.request_interval_min,
- )
-
- if self._retries < 0:
- self._retries = float("inf")
- if not self._retry_codes:
- self._retry_codes = ()
-
- self._init_session()
- self._init_cookies()
-
@classmethod
def from_url(cls, url):
if isinstance(cls.pattern, str):
@@ -79,8 +55,19 @@ class Extractor():
return cls(match) if match else None
def __iter__(self):
+ self.initialize()
return self.items()
+ def initialize(self):
+ self._init_options()
+ self._init_session()
+ self._init_cookies()
+ self._init()
+ self.initialize = util.noop
+
+ def finalize(self):
+ pass
+
def items(self):
yield Message.Version, 1
@@ -109,16 +96,22 @@ class Extractor():
return config.accumulate(self._cfgpath, key)
def _config_shared(self, key, default=None):
- return config.interpolate_common(("extractor",), (
- (self.category, self.subcategory),
- (self.basecategory, self.subcategory),
- ), key, default)
+ return config.interpolate_common(
+ ("extractor",), self._cfgpath, key, default)
def _config_shared_accumulate(self, key):
- values = config.accumulate(self._cfgpath, key)
- conf = config.get(("extractor",), self.basecategory)
- if conf:
- values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
+ first = True
+ extr = ("extractor",)
+
+ for path in self._cfgpath:
+ if first:
+ first = False
+ values = config.accumulate(extr + path, key)
+ else:
+ conf = config.get(extr, path[0])
+ if conf:
+ values[:0] = config.accumulate(
+ (self.subcategory,), key, conf=conf)
return values
def request(self, url, method="GET", session=None,
@@ -245,6 +238,26 @@ class Extractor():
return username, password
+ def _init(self):
+ pass
+
+ def _init_options(self):
+ self._write_pages = self.config("write-pages", False)
+ self._retry_codes = self.config("retry-codes")
+ self._retries = self.config("retries", 4)
+ self._timeout = self.config("timeout", 30)
+ self._verify = self.config("verify", True)
+ self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
+ self._interval = util.build_duration_func(
+ self.config("sleep-request", self.request_interval),
+ self.request_interval_min,
+ )
+
+ if self._retries < 0:
+ self._retries = float("inf")
+ if not self._retry_codes:
+ self._retry_codes = ()
+
def _init_session(self):
self.session = session = requests.Session()
headers = session.headers
@@ -286,7 +299,7 @@ class Extractor():
useragent = self.config("user-agent")
if useragent is None:
useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:115.0) Gecko/20100101 Firefox/115.0")
+ "rv:109.0) Gecko/20100101 Firefox/115.0")
elif useragent == "browser":
useragent = _browser_useragent()
headers["User-Agent"] = useragent
@@ -298,6 +311,13 @@ class Extractor():
else:
headers["Accept-Encoding"] = "gzip, deflate"
+ referer = self.config("referer", self.referer)
+ if referer:
+ if isinstance(referer, str):
+ headers["Referer"] = referer
+ elif self.root:
+ headers["Referer"] = self.root + "/"
+
custom_headers = self.config("headers")
if custom_headers:
headers.update(custom_headers)
@@ -330,26 +350,26 @@ class Extractor():
def _init_cookies(self):
"""Populate the session's cookiejar"""
- self._cookiefile = None
- self._cookiejar = self.session.cookies
- if self.cookiedomain is None:
+ self.cookies = self.session.cookies
+ self.cookies_file = None
+ if self.cookies_domain is None:
return
cookies = self.config("cookies")
if cookies:
if isinstance(cookies, dict):
- self._update_cookies_dict(cookies, self.cookiedomain)
+ self.cookies_update_dict(cookies, self.cookies_domain)
elif isinstance(cookies, str):
- cookiefile = util.expand_path(cookies)
+ path = util.expand_path(cookies)
try:
- with open(cookiefile) as fp:
- util.cookiestxt_load(fp, self._cookiejar)
+ with open(path) as fp:
+ util.cookiestxt_load(fp, self.cookies)
except Exception as exc:
self.log.warning("cookies: %s", exc)
else:
self.log.debug("Loading cookies from '%s'", cookies)
- self._cookiefile = cookiefile
+ self.cookies_file = path
elif isinstance(cookies, (list, tuple)):
key = tuple(cookies)
@@ -357,7 +377,7 @@ class Extractor():
if cookiejar is None:
from ..cookies import load_cookies
- cookiejar = self._cookiejar.__class__()
+ cookiejar = self.cookies.__class__()
try:
load_cookies(cookiejar, cookies)
except Exception as exc:
@@ -367,9 +387,9 @@ class Extractor():
else:
self.log.debug("Using cached cookies from %s", key)
- setcookie = self._cookiejar.set_cookie
+ set_cookie = self.cookies.set_cookie
for cookie in cookiejar:
- setcookie(cookie)
+ set_cookie(cookie)
else:
self.log.warning(
@@ -377,8 +397,8 @@ class Extractor():
"option, got '%s' (%s)",
cookies.__class__.__name__, cookies)
- def _store_cookies(self):
- """Store the session's cookiejar in a cookies.txt file"""
+ def cookies_store(self):
+ """Store the session's cookies in a cookies.txt file"""
export = self.config("cookies-update", True)
if not export:
return
@@ -386,47 +406,47 @@ class Extractor():
if isinstance(export, str):
path = util.expand_path(export)
else:
- path = self._cookiefile
+ path = self.cookies_file
if not path:
return
try:
with open(path, "w") as fp:
- util.cookiestxt_store(fp, self._cookiejar)
+ util.cookiestxt_store(fp, self.cookies)
except OSError as exc:
self.log.warning("cookies: %s", exc)
- def _update_cookies(self, cookies, domain=""):
+ def cookies_update(self, cookies, domain=""):
"""Update the session's cookiejar with 'cookies'"""
if isinstance(cookies, dict):
- self._update_cookies_dict(cookies, domain or self.cookiedomain)
+ self.cookies_update_dict(cookies, domain or self.cookies_domain)
else:
- setcookie = self._cookiejar.set_cookie
+ set_cookie = self.cookies.set_cookie
try:
cookies = iter(cookies)
except TypeError:
- setcookie(cookies)
+ set_cookie(cookies)
else:
for cookie in cookies:
- setcookie(cookie)
+ set_cookie(cookie)
- def _update_cookies_dict(self, cookiedict, domain):
+ def cookies_update_dict(self, cookiedict, domain):
"""Update cookiejar with name-value pairs from a dict"""
- setcookie = self._cookiejar.set
+ set_cookie = self.cookies.set
for name, value in cookiedict.items():
- setcookie(name, value, domain=domain)
+ set_cookie(name, value, domain=domain)
- def _check_cookies(self, cookienames, domain=None):
- """Check if all 'cookienames' are in the session's cookiejar"""
- if not self._cookiejar:
+ def cookies_check(self, cookies_names, domain=None):
+ """Check if all 'cookies_names' are in the session's cookiejar"""
+ if not self.cookies:
return False
if domain is None:
- domain = self.cookiedomain
- names = set(cookienames)
+ domain = self.cookies_domain
+ names = set(cookies_names)
now = time.time()
- for cookie in self._cookiejar:
+ for cookie in self.cookies:
if cookie.name in names and (
not domain or cookie.domain == domain):
@@ -450,9 +470,16 @@ class Extractor():
return False
def _prepare_ddosguard_cookies(self):
- if not self._cookiejar.get("__ddg2", domain=self.cookiedomain):
- self._cookiejar.set(
- "__ddg2", util.generate_token(), domain=self.cookiedomain)
+ if not self.cookies.get("__ddg2", domain=self.cookies_domain):
+ self.cookies.set(
+ "__ddg2", util.generate_token(), domain=self.cookies_domain)
+
+ def _cache(self, func, maxage, keyarg=None):
+ # return cache.DatabaseCacheDecorator(func, maxage, keyarg)
+ return cache.DatabaseCacheDecorator(func, keyarg, maxage)
+
+ def _cache_memory(self, func, maxage=None, keyarg=None):
+ return cache.Memcache()
def _get_date_min_max(self, dmin=None, dmax=None):
"""Retrieve and parse 'date-min' and 'date-max' config values"""
@@ -489,19 +516,8 @@ class Extractor():
return iter(result)
@classmethod
- def _get_tests(cls):
- """Yield an extractor's test cases as (URL, RESULTS) tuples"""
- tests = cls.test
- if not tests:
- return
-
- if len(tests) == 2 and (not tests[1] or isinstance(tests[1], dict)):
- tests = (tests,)
-
- for test in tests:
- if isinstance(test, str):
- test = (test, None)
- yield test
+ def _dump(cls, obj):
+ util.dump_json(obj, ensure_ascii=False, indent=2)
def _dump_response(self, response, history=True):
"""Write the response content to a .dump file in the current directory.
@@ -654,6 +670,8 @@ class AsynchronousMixin():
"""Run info extraction in a separate thread"""
def __iter__(self):
+ self.initialize()
+
messages = queue.Queue(5)
thread = threading.Thread(
target=self.async_items,
@@ -805,8 +823,8 @@ _browser_cookies = {}
HTTP_HEADERS = {
"firefox": (
- ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) "
- "Gecko/20100101 Firefox/115.0"),
+ ("User-Agent", "Mozilla/5.0 ({}; "
+ "rv:109.0) Gecko/20100101 Firefox/115.0"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
"image/avif,image/webp,*/*;q=0.8"),
("Accept-Language", "en-US,en;q=0.5"),
@@ -897,13 +915,3 @@ if action:
except Exception:
pass
del action
-
-# Undo automatic pyOpenSSL injection by requests
-pyopenssl = config.get((), "pyopenssl", False)
-if not pyopenssl:
- try:
- from requests.packages.urllib3.contrib import pyopenssl # noqa
- pyopenssl.extract_from_urllib3()
- except ImportError:
- pass
-del pyopenssl