diff options
Diffstat (limited to 'gallery_dl/extractor/common.py')
| -rw-r--r-- | gallery_dl/extractor/common.py | 196 |
1 files changed, 102 insertions, 94 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5c9b157..0d67df7 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -32,45 +32,21 @@ class Extractor(): directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" archive_fmt = "" - cookiedomain = "" - browser = None root = "" - test = None - finalize = None + cookies_domain = "" + referer = True + tls12 = True + browser = None request_interval = 0.0 request_interval_min = 0.0 request_timestamp = 0.0 - tls12 = True def __init__(self, match): self.log = logging.getLogger(self.category) self.url = match.string - - if self.basecategory: - self.config = self._config_shared - self.config_accumulate = self._config_shared_accumulate self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" - self._write_pages = self.config("write-pages", False) - self._retry_codes = self.config("retry-codes") - self._retries = self.config("retries", 4) - self._timeout = self.config("timeout", 30) - self._verify = self.config("verify", True) - self._proxies = util.build_proxy_map(self.config("proxy"), self.log) - self._interval = util.build_duration_func( - self.config("sleep-request", self.request_interval), - self.request_interval_min, - ) - - if self._retries < 0: - self._retries = float("inf") - if not self._retry_codes: - self._retry_codes = () - - self._init_session() - self._init_cookies() - @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -79,8 +55,19 @@ class Extractor(): return cls(match) if match else None def __iter__(self): + self.initialize() return self.items() + def initialize(self): + self._init_options() + self._init_session() + self._init_cookies() + self._init() + self.initialize = util.noop + + def finalize(self): + pass + def items(self): yield Message.Version, 1 @@ -109,16 +96,22 @@ class Extractor(): return config.accumulate(self._cfgpath, key) def _config_shared(self, key, default=None): - return config.interpolate_common(("extractor",), ( - (self.category, self.subcategory), - (self.basecategory, self.subcategory), - ), key, default) + return config.interpolate_common( + ("extractor",), self._cfgpath, key, default) def _config_shared_accumulate(self, key): - values = config.accumulate(self._cfgpath, key) - conf = config.get(("extractor",), self.basecategory) - if conf: - values[:0] = config.accumulate((self.subcategory,), key, conf=conf) + first = True + extr = ("extractor",) + + for path in self._cfgpath: + if first: + first = False + values = config.accumulate(extr + path, key) + else: + conf = config.get(extr, path[0]) + if conf: + values[:0] = config.accumulate( + (self.subcategory,), key, conf=conf) return values def request(self, url, method="GET", session=None, @@ -245,6 +238,26 @@ class Extractor(): return username, password + def _init(self): + pass + + def _init_options(self): + self._write_pages = self.config("write-pages", False) + self._retry_codes = self.config("retry-codes") + self._retries = self.config("retries", 4) + self._timeout = self.config("timeout", 30) + self._verify = self.config("verify", True) + self._proxies = util.build_proxy_map(self.config("proxy"), self.log) + self._interval = util.build_duration_func( + self.config("sleep-request", self.request_interval), + self.request_interval_min, + ) + + if self._retries < 0: + self._retries = float("inf") + if not self._retry_codes: + self._retry_codes = () + def _init_session(self): self.session = session = requests.Session() headers = session.headers @@ -286,7 +299,7 @@ class Extractor(): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:115.0) Gecko/20100101 Firefox/115.0") + "rv:109.0) Gecko/20100101 Firefox/115.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -298,6 +311,13 @@ class Extractor(): else: headers["Accept-Encoding"] = "gzip, deflate" + referer = self.config("referer", self.referer) + if referer: + if isinstance(referer, str): + headers["Referer"] = referer + elif self.root: + headers["Referer"] = self.root + "/" + custom_headers = self.config("headers") if custom_headers: headers.update(custom_headers) @@ -330,26 +350,26 @@ class Extractor(): def _init_cookies(self): """Populate the session's cookiejar""" - self._cookiefile = None - self._cookiejar = self.session.cookies - if self.cookiedomain is None: + self.cookies = self.session.cookies + self.cookies_file = None + if self.cookies_domain is None: return cookies = self.config("cookies") if cookies: if isinstance(cookies, dict): - self._update_cookies_dict(cookies, self.cookiedomain) + self.cookies_update_dict(cookies, self.cookies_domain) elif isinstance(cookies, str): - cookiefile = util.expand_path(cookies) + path = util.expand_path(cookies) try: - with open(cookiefile) as fp: - util.cookiestxt_load(fp, self._cookiejar) + with open(path) as fp: + util.cookiestxt_load(fp, self.cookies) except Exception as exc: self.log.warning("cookies: %s", exc) else: self.log.debug("Loading cookies from '%s'", cookies) - self._cookiefile = cookiefile + self.cookies_file = path elif isinstance(cookies, (list, tuple)): key = tuple(cookies) @@ -357,7 +377,7 @@ class Extractor(): if cookiejar is None: from ..cookies import load_cookies - cookiejar = self._cookiejar.__class__() + cookiejar = self.cookies.__class__() try: load_cookies(cookiejar, cookies) except Exception as exc: @@ -367,9 +387,9 @@ class Extractor(): else: self.log.debug("Using cached cookies from %s", key) - setcookie = self._cookiejar.set_cookie + set_cookie = self.cookies.set_cookie for cookie in cookiejar: - setcookie(cookie) + set_cookie(cookie) else: self.log.warning( @@ -377,8 +397,8 @@ class Extractor(): "option, got '%s' (%s)", cookies.__class__.__name__, cookies) - def _store_cookies(self): - """Store the session's cookiejar in a cookies.txt file""" + def cookies_store(self): + """Store the session's cookies in a cookies.txt file""" export = self.config("cookies-update", True) if not export: return @@ -386,47 +406,47 @@ class Extractor(): if isinstance(export, str): path = util.expand_path(export) else: - path = self._cookiefile + path = self.cookies_file if not path: return try: with open(path, "w") as fp: - util.cookiestxt_store(fp, self._cookiejar) + util.cookiestxt_store(fp, self.cookies) except OSError as exc: self.log.warning("cookies: %s", exc) - def _update_cookies(self, cookies, domain=""): + def cookies_update(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" if isinstance(cookies, dict): - self._update_cookies_dict(cookies, domain or self.cookiedomain) + self.cookies_update_dict(cookies, domain or self.cookies_domain) else: - setcookie = self._cookiejar.set_cookie + set_cookie = self.cookies.set_cookie try: cookies = iter(cookies) except TypeError: - setcookie(cookies) + set_cookie(cookies) else: for cookie in cookies: - setcookie(cookie) + set_cookie(cookie) - def _update_cookies_dict(self, cookiedict, domain): + def cookies_update_dict(self, cookiedict, domain): """Update cookiejar with name-value pairs from a dict""" - setcookie = self._cookiejar.set + set_cookie = self.cookies.set for name, value in cookiedict.items(): - setcookie(name, value, domain=domain) + set_cookie(name, value, domain=domain) - def _check_cookies(self, cookienames, domain=None): - """Check if all 'cookienames' are in the session's cookiejar""" - if not self._cookiejar: + def cookies_check(self, cookies_names, domain=None): + """Check if all 'cookies_names' are in the session's cookiejar""" + if not self.cookies: return False if domain is None: - domain = self.cookiedomain - names = set(cookienames) + domain = self.cookies_domain + names = set(cookies_names) now = time.time() - for cookie in self._cookiejar: + for cookie in self.cookies: if cookie.name in names and ( not domain or cookie.domain == domain): @@ -450,9 +470,16 @@ class Extractor(): return False def _prepare_ddosguard_cookies(self): - if not self._cookiejar.get("__ddg2", domain=self.cookiedomain): - self._cookiejar.set( - "__ddg2", util.generate_token(), domain=self.cookiedomain) + if not self.cookies.get("__ddg2", domain=self.cookies_domain): + self.cookies.set( + "__ddg2", util.generate_token(), domain=self.cookies_domain) + + def _cache(self, func, maxage, keyarg=None): + # return cache.DatabaseCacheDecorator(func, maxage, keyarg) + return cache.DatabaseCacheDecorator(func, keyarg, maxage) + + def _cache_memory(self, func, maxage=None, keyarg=None): + return cache.Memcache() def _get_date_min_max(self, dmin=None, dmax=None): """Retrieve and parse 'date-min' and 'date-max' config values""" @@ -489,19 +516,8 @@ class Extractor(): return iter(result) @classmethod - def _get_tests(cls): - """Yield an extractor's test cases as (URL, RESULTS) tuples""" - tests = cls.test - if not tests: - return - - if len(tests) == 2 and (not tests[1] or isinstance(tests[1], dict)): - tests = (tests,) - - for test in tests: - if isinstance(test, str): - test = (test, None) - yield test + def _dump(cls, obj): + util.dump_json(obj, ensure_ascii=False, indent=2) def _dump_response(self, response, history=True): """Write the response content to a .dump file in the current directory. @@ -654,6 +670,8 @@ class AsynchronousMixin(): """Run info extraction in a separate thread""" def __iter__(self): + self.initialize() + messages = queue.Queue(5) thread = threading.Thread( target=self.async_items, @@ -805,8 +823,8 @@ _browser_cookies = {} HTTP_HEADERS = { "firefox": ( - ("User-Agent", "Mozilla/5.0 ({}; rv:115.0) " - "Gecko/20100101 Firefox/115.0"), + ("User-Agent", "Mozilla/5.0 ({}; " + "rv:109.0) Gecko/20100101 Firefox/115.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," "image/avif,image/webp,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), @@ -897,13 +915,3 @@ if action: except Exception: pass del action - -# Undo automatic pyOpenSSL injection by requests -pyopenssl = config.get((), "pyopenssl", False) -if not pyopenssl: - try: - from requests.packages.urllib3.contrib import pyopenssl # noqa - pyopenssl.extract_from_urllib3() - except ImportError: - pass -del pyopenssl |
