summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/common.py')
-rw-r--r--gallery_dl/extractor/common.py93
1 files changed, 58 insertions, 35 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index a90af1c..0d258eb 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -18,7 +18,7 @@ import requests
import threading
import http.cookiejar
from .message import Message
-from .. import config, text, exception, cloudflare
+from .. import config, text, util, exception, cloudflare
class Extractor():
@@ -37,9 +37,9 @@ class Extractor():
self.session = requests.Session()
self.log = logging.getLogger(self.category)
self.url = match.string
- self._init_headers()
- self._init_cookies()
- self._init_proxies()
+
+ self._cookiefile = None
+ self._cookiejar = self.session.cookies
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -47,6 +47,10 @@ class Extractor():
if self._retries < 0:
self._retries = float("inf")
+ self._init_headers()
+ self._init_cookies()
+ self._init_proxies()
+
@classmethod
def from_url(cls, url):
if isinstance(cls.pattern, str):
@@ -67,7 +71,7 @@ class Extractor():
return config.interpolate(
("extractor", self.category, self.subcategory, key), default)
- def request(self, url, method="GET", *, session=None, retries=None,
+ def request(self, url, *, method="GET", session=None, retries=None,
encoding=None, fatal=True, notfound=None, **kwargs):
tries = 1
retries = self._retries if retries is None else retries
@@ -110,7 +114,7 @@ class Extractor():
msg = ""
self.log.warning("Cloudflare CAPTCHA" + msg)
- msg = "{}: {} for url: {}".format(code, response.reason, url)
+ msg = "'{} {}' for '{}'".format(code, response.reason, url)
if code < 500 and code != 429 and code != 430:
break
@@ -141,7 +145,7 @@ class Extractor():
return username, password
def _init_headers(self):
- """Set additional headers for the 'session' object"""
+ """Initialize HTTP headers for the 'session' object"""
headers = self.session.headers
headers.clear()
@@ -174,26 +178,43 @@ class Extractor():
if cookies:
if isinstance(cookies, dict):
self._update_cookies_dict(cookies, self.cookiedomain)
- else:
+ elif isinstance(cookies, str):
+ cookiefile = util.expand_path(cookies)
cookiejar = http.cookiejar.MozillaCookieJar()
try:
- cookiejar.load(cookies)
+ cookiejar.load(cookiefile)
except OSError as exc:
self.log.warning("cookies: %s", exc)
else:
- self.session.cookies.update(cookiejar)
+ self._cookiejar.update(cookiejar)
+ self._cookiefile = cookiefile
+ else:
+ self.log.warning(
+ "expected 'dict' or 'str' value for 'cookies' option, "
+ "got '%s' (%s)", cookies.__class__.__name__, cookies)
cookies = cloudflare.cookies(self.category)
if cookies:
domain, cookies = cookies
self._update_cookies_dict(cookies, domain)
+ def _store_cookies(self):
+ """Store the session's cookiejar in a cookies.txt file"""
+ if self._cookiefile and self.config("cookies-update", False):
+ cookiejar = http.cookiejar.MozillaCookieJar()
+ for cookie in self._cookiejar:
+ cookiejar.set_cookie(cookie)
+ try:
+ cookiejar.save(self._cookiefile)
+ except OSError as exc:
+ self.log.warning("cookies: %s", exc)
+
def _update_cookies(self, cookies, *, domain=""):
"""Update the session's cookiejar with 'cookies'"""
if isinstance(cookies, dict):
self._update_cookies_dict(cookies, domain or self.cookiedomain)
else:
- setcookie = self.session.cookies.set_cookie
+ setcookie = self._cookiejar.set_cookie
try:
cookies = iter(cookies)
except TypeError:
@@ -204,17 +225,17 @@ class Extractor():
def _update_cookies_dict(self, cookiedict, domain):
"""Update cookiejar with name-value pairs from a dict"""
- setcookie = self.session.cookies.set
+ setcookie = self._cookiejar.set
for name, value in cookiedict.items():
setcookie(name, value, domain=domain)
- def _check_cookies(self, cookienames, *, domain=""):
+ def _check_cookies(self, cookienames, *, domain=None):
"""Check if all 'cookienames' are in the session's cookiejar"""
- if not domain:
+ if domain is None:
domain = self.cookiedomain
try:
for name in cookienames:
- self.session.cookies._find(name, domain)
+ self._cookiejar._find(name, domain)
except KeyError:
return False
return True
@@ -249,24 +270,21 @@ class Extractor():
yield test
-class ChapterExtractor(Extractor):
+class GalleryExtractor(Extractor):
- subcategory = "chapter"
- directory_fmt = (
- "{category}", "{manga}",
- "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
- filename_fmt = (
- "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
- archive_fmt = (
- "{manga}_{chapter}{chapter_minor}_{page}")
+ subcategory = "gallery"
+ filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ archive_fmt = "{gallery_id}_{num}"
+ enum = "num"
def __init__(self, match, url=None):
Extractor.__init__(self, match)
- self.chapter_url = url or self.root + match.group(1)
+ self.gallery_url = self.root + match.group(1) if url is None else url
def items(self):
self.login()
- page = self.request(self.chapter_url).text
+ page = self.request(self.gallery_url).text
data = self.metadata(page)
imgs = self.images(page)
@@ -284,7 +302,7 @@ class ChapterExtractor(Extractor):
yield Message.Version, 1
yield Message.Directory, data
- for data["page"], (url, imgdata) in images:
+ for data[self.enum], (url, imgdata) in images:
if imgdata:
data.update(imgdata)
yield Message.Url, url, text.nameext_from_url(url, data)
@@ -299,6 +317,19 @@ class ChapterExtractor(Extractor):
"""Return a list of all (image-url, metadata)-tuples"""
+class ChapterExtractor(GalleryExtractor):
+
+ subcategory = "chapter"
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
+ archive_fmt = (
+ "{manga}_{chapter}{chapter_minor}_{page}")
+ enum = "page"
+
+
class MangaExtractor(Extractor):
subcategory = "manga"
@@ -333,14 +364,6 @@ class MangaExtractor(Extractor):
"""Return a list of all (chapter-url, metadata)-tuples"""
-class GalleryExtractor(ChapterExtractor):
-
- subcategory = "gallery"
- filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
- directory_fmt = ("{category}", "{gallery_id} {title}")
- archive_fmt = "{gallery_id}_{page}"
-
-
class AsynchronousMixin():
"""Run info extraction in a separate thread"""