diff options
Diffstat (limited to 'gallery_dl/extractor/common.py')
| -rw-r--r-- | gallery_dl/extractor/common.py | 93 |
1 files changed, 58 insertions, 35 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index a90af1c..0d258eb 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -18,7 +18,7 @@ import requests import threading import http.cookiejar from .message import Message -from .. import config, text, exception, cloudflare +from .. import config, text, util, exception, cloudflare class Extractor(): @@ -37,9 +37,9 @@ class Extractor(): self.session = requests.Session() self.log = logging.getLogger(self.category) self.url = match.string - self._init_headers() - self._init_cookies() - self._init_proxies() + + self._cookiefile = None + self._cookiejar = self.session.cookies self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) @@ -47,6 +47,10 @@ class Extractor(): if self._retries < 0: self._retries = float("inf") + self._init_headers() + self._init_cookies() + self._init_proxies() + @classmethod def from_url(cls, url): if isinstance(cls.pattern, str): @@ -67,7 +71,7 @@ class Extractor(): return config.interpolate( ("extractor", self.category, self.subcategory, key), default) - def request(self, url, method="GET", *, session=None, retries=None, + def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): tries = 1 retries = self._retries if retries is None else retries @@ -110,7 +114,7 @@ class Extractor(): msg = "" self.log.warning("Cloudflare CAPTCHA" + msg) - msg = "{}: {} for url: {}".format(code, response.reason, url) + msg = "'{} {}' for '{}'".format(code, response.reason, url) if code < 500 and code != 429 and code != 430: break @@ -141,7 +145,7 @@ class Extractor(): return username, password def _init_headers(self): - """Set additional headers for the 'session' object""" + """Initialize HTTP headers for the 'session' object""" headers = self.session.headers headers.clear() @@ -174,26 +178,43 @@ class Extractor(): if cookies: if isinstance(cookies, dict): self._update_cookies_dict(cookies, self.cookiedomain) - else: + elif isinstance(cookies, str): + cookiefile = util.expand_path(cookies) cookiejar = http.cookiejar.MozillaCookieJar() try: - cookiejar.load(cookies) + cookiejar.load(cookiefile) except OSError as exc: self.log.warning("cookies: %s", exc) else: - self.session.cookies.update(cookiejar) + self._cookiejar.update(cookiejar) + self._cookiefile = cookiefile + else: + self.log.warning( + "expected 'dict' or 'str' value for 'cookies' option, " + "got '%s' (%s)", cookies.__class__.__name__, cookies) cookies = cloudflare.cookies(self.category) if cookies: domain, cookies = cookies self._update_cookies_dict(cookies, domain) + def _store_cookies(self): + """Store the session's cookiejar in a cookies.txt file""" + if self._cookiefile and self.config("cookies-update", False): + cookiejar = http.cookiejar.MozillaCookieJar() + for cookie in self._cookiejar: + cookiejar.set_cookie(cookie) + try: + cookiejar.save(self._cookiefile) + except OSError as exc: + self.log.warning("cookies: %s", exc) + def _update_cookies(self, cookies, *, domain=""): """Update the session's cookiejar with 'cookies'""" if isinstance(cookies, dict): self._update_cookies_dict(cookies, domain or self.cookiedomain) else: - setcookie = self.session.cookies.set_cookie + setcookie = self._cookiejar.set_cookie try: cookies = iter(cookies) except TypeError: @@ -204,17 +225,17 @@ class Extractor(): def _update_cookies_dict(self, cookiedict, domain): """Update cookiejar with name-value pairs from a dict""" - setcookie = self.session.cookies.set + setcookie = self._cookiejar.set for name, value in cookiedict.items(): setcookie(name, value, domain=domain) - def _check_cookies(self, cookienames, *, domain=""): + def _check_cookies(self, cookienames, *, domain=None): """Check if all 'cookienames' are in the session's cookiejar""" - if not domain: + if domain is None: domain = self.cookiedomain try: for name in cookienames: - self.session.cookies._find(name, domain) + self._cookiejar._find(name, domain) except KeyError: return False return True @@ -249,24 +270,21 @@ class Extractor(): yield test -class ChapterExtractor(Extractor): +class GalleryExtractor(Extractor): - subcategory = "chapter" - directory_fmt = ( - "{category}", "{manga}", - "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") - filename_fmt = ( - "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") - archive_fmt = ( - "{manga}_{chapter}{chapter_minor}_{page}") + subcategory = "gallery" + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + archive_fmt = "{gallery_id}_{num}" + enum = "num" def __init__(self, match, url=None): Extractor.__init__(self, match) - self.chapter_url = url or self.root + match.group(1) + self.gallery_url = self.root + match.group(1) if url is None else url def items(self): self.login() - page = self.request(self.chapter_url).text + page = self.request(self.gallery_url).text data = self.metadata(page) imgs = self.images(page) @@ -284,7 +302,7 @@ class ChapterExtractor(Extractor): yield Message.Version, 1 yield Message.Directory, data - for data["page"], (url, imgdata) in images: + for data[self.enum], (url, imgdata) in images: if imgdata: data.update(imgdata) yield Message.Url, url, text.nameext_from_url(url, data) @@ -299,6 +317,19 @@ class ChapterExtractor(Extractor): """Return a list of all (image-url, metadata)-tuples""" +class ChapterExtractor(GalleryExtractor): + + subcategory = "chapter" + directory_fmt = ( + "{category}", "{manga}", + "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") + archive_fmt = ( + "{manga}_{chapter}{chapter_minor}_{page}") + enum = "page" + + class MangaExtractor(Extractor): subcategory = "manga" @@ -333,14 +364,6 @@ class MangaExtractor(Extractor): """Return a list of all (chapter-url, metadata)-tuples""" -class GalleryExtractor(ChapterExtractor): - - subcategory = "gallery" - filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" - directory_fmt = ("{category}", "{gallery_id} {title}") - archive_fmt = "{gallery_id}_{page}" - - class AsynchronousMixin(): """Run info extraction in a separate thread""" |
