diff options
| author | 2019-07-20 05:51:44 -0400 | |
|---|---|---|
| committer | 2019-07-20 05:51:44 -0400 | |
| commit | 2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (patch) | |
| tree | 3d5f633ff69cd393036a3dabc4d4533c8484f9ad /gallery_dl/downloader | |
| parent | 195c45911e79c33cf0bb986721365fb06df5a153 (diff) | |
New upstream version 1.9.0 (upstream/1.9.0)
Diffstat (limited to 'gallery_dl/downloader')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gallery_dl/downloader/__init__.py | 21 |
| -rw-r--r-- | gallery_dl/downloader/common.py | 142 |
| -rw-r--r-- | gallery_dl/downloader/http.py | 197 |
| -rw-r--r-- | gallery_dl/downloader/text.py | 27 |
| -rw-r--r-- | gallery_dl/downloader/ytdl.py | 7 |
5 files changed, 189 insertions, 205 deletions
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py index 97972cd..6fb09e1 100644 --- a/gallery_dl/downloader/__init__.py +++ b/gallery_dl/downloader/__init__.py @@ -22,15 +22,24 @@ def find(scheme): try: return _cache[scheme] except KeyError: - klass = None + pass + + klass = None + if scheme == "https": + scheme = "http" + if scheme in modules: # prevent unwanted imports try: - if scheme in modules: # prevent unwanted imports - module = importlib.import_module("." + scheme, __package__) - klass = module.__downloader__ - except (ImportError, AttributeError, TypeError): + module = importlib.import_module("." + scheme, __package__) + except ImportError: pass + else: + klass = module.__downloader__ + + if scheme == "http": + _cache["http"] = _cache["https"] = klass + else: _cache[scheme] = klass - return klass + return klass # -------------------------------------------------------------------- diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index 4803c85..6e5cd4c 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,23 +9,18 @@ """Common classes and constants used by downloader modules.""" import os -import time import logging -from .. import config, util, exception -from requests.exceptions import RequestException -from ssl import SSLError +from .. import config, util class DownloaderBase(): """Base class for downloaders""" scheme = "" - retries = 1 def __init__(self, extractor, output): self.session = extractor.session self.out = output self.log = logging.getLogger("downloader." 
+ self.scheme) - self.downloading = False self.part = self.config("part", True) self.partdir = self.config("part-directory") @@ -34,137 +29,8 @@ class DownloaderBase(): os.makedirs(self.partdir, exist_ok=True) def config(self, key, default=None): - """Interpolate config value for 'key'""" + """Interpolate downloader config value for 'key'""" return config.interpolate(("downloader", self.scheme, key), default) def download(self, url, pathfmt): - """Download the resource at 'url' and write it to a file-like object""" - try: - return self.download_impl(url, pathfmt) - except Exception: - print() - raise - finally: - # remove file from incomplete downloads - if self.downloading and not self.part: - try: - os.remove(pathfmt.temppath) - except (OSError, AttributeError): - pass - - def download_impl(self, url, pathfmt): - """Actual implementaion of the download process""" - adj_ext = None - tries = 0 - msg = "" - - if self.part: - pathfmt.part_enable(self.partdir) - - while True: - self.reset() - if tries: - self.log.warning("%s (%d/%d)", msg, tries, self.retries) - if tries >= self.retries: - return False - time.sleep(tries) - tries += 1 - - # check for .part file - filesize = pathfmt.part_size() - - # connect to (remote) source - try: - offset, size = self.connect(url, filesize) - except exception.DownloadRetry as exc: - msg = exc - continue - except exception.DownloadComplete: - break - except Exception as exc: - self.log.warning(exc) - return False - - # check response - if not offset: - mode = "w+b" - if filesize: - self.log.info("Unable to resume partial download") - else: - mode = "r+b" - self.log.info("Resuming download at byte %d", offset) - - # set missing filename extension - if not pathfmt.has_extension: - pathfmt.set_extension(self.get_extension()) - if pathfmt.exists(): - pathfmt.temppath = "" - return True - - self.out.start(pathfmt.path) - self.downloading = True - with pathfmt.open(mode) as file: - if offset: - file.seek(offset) - - # download content - 
try: - self.receive(file) - except (RequestException, SSLError) as exc: - msg = exc - print() - continue - - # check filesize - if size and file.tell() < size: - msg = "filesize mismatch ({} < {})".format( - file.tell(), size) - continue - - # check filename extension - adj_ext = self._check_extension(file, pathfmt) - - break - - self.downloading = False - if adj_ext: - pathfmt.set_extension(adj_ext) - return True - - def connect(self, url, offset): - """Connect to 'url' while respecting 'offset' if possible - - Returns a 2-tuple containing the actual offset and expected filesize. - If the returned offset-value is greater than zero, all received data - will be appended to the existing .part file. - Return '0' as second tuple-field to indicate an unknown filesize. - """ - - def receive(self, file): - """Write data to 'file'""" - - def reset(self): - """Reset internal state / cleanup""" - - def get_extension(self): - """Return a filename extension appropriate for the current request""" - - @staticmethod - def _check_extension(file, pathfmt): - """Check filename extension against fileheader""" - extension = pathfmt.keywords["extension"] - if extension in FILETYPE_CHECK: - file.seek(0) - header = file.read(8) - if len(header) >= 8 and not FILETYPE_CHECK[extension](header): - for ext, check in FILETYPE_CHECK.items(): - if ext != extension and check(header): - return ext - return None - - -FILETYPE_CHECK = { - "jpg": lambda h: h[0:2] == b"\xff\xd8", - "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", - "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, -} + """Write data from 'url' into the file specified by 'pathfmt'""" diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 961c1a2..7a95191 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can 
redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,11 +8,17 @@ """Downloader module for http:// and https:// URLs""" +import os import time import mimetypes -from requests.exceptions import ConnectionError, Timeout +from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, exception +from .. import text + +try: + from OpenSSL.SSL import Error as SSLError +except ImportError: + from ssl import SSLError class HttpDownloader(DownloaderBase): @@ -20,13 +26,16 @@ class HttpDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, output) - self.response = None self.retries = self.config("retries", extractor._retries) self.timeout = self.config("timeout", extractor._timeout) self.verify = self.config("verify", extractor._verify) + self.mtime = self.config("mtime", True) self.rate = self.config("rate") + self.downloading = False self.chunk_size = 16384 + if self.retries < 0: + self.retries = float("inf") if self.rate: self.rate = text.parse_bytes(self.rate) if not self.rate: @@ -34,41 +43,132 @@ class HttpDownloader(DownloaderBase): elif self.rate < self.chunk_size: self.chunk_size = self.rate - def connect(self, url, offset): - headers = {} - if offset: - headers["Range"] = "bytes={}-".format(offset) - + def download(self, url, pathfmt): try: - self.response = self.session.request( - "GET", url, stream=True, headers=headers, allow_redirects=True, - timeout=self.timeout, verify=self.verify) - except (ConnectionError, Timeout) as exc: - raise exception.DownloadRetry(exc) - - code = self.response.status_code - if code == 200: # OK - offset = 0 - size = self.response.headers.get("Content-Length") - elif code == 206: # Partial Content - size = self.response.headers["Content-Range"].rpartition("/")[2] - elif code == 416: # Requested Range Not Satisfiable - raise exception.DownloadComplete() - elif code 
== 429 or 500 <= code < 600: # Server Error - raise exception.DownloadRetry( - "{} Server Error: {} for url: {}".format( - code, self.response.reason, url)) - else: - self.response.raise_for_status() - - return offset, text.parse_int(size) - - def receive(self, file): + return self._download_impl(url, pathfmt) + except Exception: + print() + raise + finally: + # remove file from incomplete downloads + if self.downloading and not self.part: + try: + os.unlink(pathfmt.temppath) + except (OSError, AttributeError): + pass + + def _download_impl(self, url, pathfmt): + response = None + adj_ext = None + tries = 0 + msg = "" + + if self.part: + pathfmt.part_enable(self.partdir) + + while True: + if tries: + if response: + response.close() + self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) + if tries > self.retries: + return False + time.sleep(min(2 ** (tries-1), 1800)) + tries += 1 + + # check for .part file + filesize = pathfmt.part_size() + if filesize: + headers = {"Range": "bytes={}-".format(filesize)} + else: + headers = None + + # connect to (remote) source + try: + response = self.session.request( + "GET", url, stream=True, headers=headers, + timeout=self.timeout, verify=self.verify) + except (ConnectionError, Timeout) as exc: + msg = str(exc) + continue + except Exception as exc: + self.log.warning("%s", exc) + return False + + # check response + code = response.status_code + if code == 200: # OK + offset = 0 + size = response.headers.get("Content-Length") + elif code == 206: # Partial Content + offset = filesize + size = response.headers["Content-Range"].rpartition("/")[2] + elif code == 416: # Requested Range Not Satisfiable + break + else: + msg = "{}: {} for url: {}".format(code, response.reason, url) + if code == 429 or 500 <= code < 600: # Server Error + continue + self.log.warning("%s", msg) + return False + size = text.parse_int(size) + + # set missing filename extension + if not pathfmt.has_extension: + 
pathfmt.set_extension(self.get_extension(response)) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + + # set open mode + if not offset: + mode = "w+b" + if filesize: + self.log.info("Unable to resume partial download") + else: + mode = "r+b" + self.log.info("Resuming download at byte %d", offset) + + # start downloading + self.out.start(pathfmt.path) + self.downloading = True + with pathfmt.open(mode) as file: + if offset: + file.seek(offset) + + # download content + try: + self.receive(response, file) + except (RequestException, SSLError) as exc: + msg = str(exc) + print() + continue + + # check filesize + if size and file.tell() < size: + msg = "filesize mismatch ({} < {})".format( + file.tell(), size) + print() + continue + + # check filename extension + adj_ext = self.check_extension(file, pathfmt) + + break + + self.downloading = False + if adj_ext: + pathfmt.set_extension(adj_ext) + if self.mtime: + pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified") + return True + + def receive(self, response, file): if self.rate: total = 0 # total amount of bytes received start = time.time() # start time - for data in self.response.iter_content(self.chunk_size): + for data in response.iter_content(self.chunk_size): file.write(data) if self.rate: @@ -79,13 +179,8 @@ class HttpDownloader(DownloaderBase): # sleep if less time passed than expected time.sleep(expected - delta) - def reset(self): - if self.response: - self.response.close() - self.response = None - - def get_extension(self): - mtype = self.response.headers.get("Content-Type", "image/jpeg") + def get_extension(self, response): + mtype = response.headers.get("Content-Type", "image/jpeg") mtype = mtype.partition(";")[0] if mtype in MIMETYPE_MAP: @@ -100,6 +195,26 @@ class HttpDownloader(DownloaderBase): "No filename extension found for MIME type '%s'", mtype) return "txt" + @staticmethod + def check_extension(file, pathfmt): + """Check filename extension against fileheader""" + extension 
= pathfmt.keywords["extension"] + if extension in FILETYPE_CHECK: + file.seek(0) + header = file.read(8) + if len(header) >= 8 and not FILETYPE_CHECK[extension](header): + for ext, check in FILETYPE_CHECK.items(): + if ext != extension and check(header): + return ext + return None + + +FILETYPE_CHECK = { + "jpg": lambda h: h[0:2] == b"\xff\xd8", + "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", + "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, +} + MIMETYPE_MAP = { "image/jpeg": "jpg", diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py index ca33863..c57fbd0 100644 --- a/gallery_dl/downloader/text.py +++ b/gallery_dl/downloader/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,24 +14,13 @@ from .common import DownloaderBase class TextDownloader(DownloaderBase): scheme = "text" - def __init__(self, extractor, output): - DownloaderBase.__init__(self, extractor, output) - self.content = b"" - - def connect(self, url, offset): - data = url.encode() - self.content = data[offset + 5:] - return offset, len(data) - 5 - - def receive(self, file): - file.write(self.content) - - def reset(self): - self.content = b"" - - @staticmethod - def get_extension(): - return "txt" + def download(self, url, pathfmt): + if self.part: + pathfmt.part_enable(self.partdir) + self.out.start(pathfmt.path) + with pathfmt.open("wb") as file: + file.write(url.encode()[5:]) + return True __downloader__ = TextDownloader diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 57a84d0..da57935 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -20,13 +20,15 @@ class YoutubeDLDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, 
output) + retries = self.config("retries", extractor._retries) options = { "format": self.config("format") or None, "ratelimit": text.parse_bytes(self.config("rate"), None), - "retries": self.config("retries", extractor._retries), + "retries": retries+1 if retries >= 0 else float("inf"), "socket_timeout": self.config("timeout", extractor._timeout), "nocheckcertificate": not self.config("verify", extractor._verify), "nopart": not self.part, + "updatetime": self.config("mtime", True), } options.update(self.config("raw-options") or {}) @@ -36,6 +38,9 @@ class YoutubeDLDownloader(DownloaderBase): self.ytdl = YoutubeDL(options) def download(self, url, pathfmt): + for cookie in self.session.cookies: + self.ytdl.cookiejar.set_cookie(cookie) + try: info_dict = self.ytdl.extract_info(url[5:], download=False) except Exception: |
