diff options
Diffstat (limited to 'gallery_dl/downloader/http.py')
| -rw-r--r-- | gallery_dl/downloader/http.py | 197 |
1 files changed, 156 insertions, 41 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 961c1a2..7a95191 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2018 Mike Fährmann +# Copyright 2014-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,11 +8,17 @@ """Downloader module for http:// and https:// URLs""" +import os import time import mimetypes -from requests.exceptions import ConnectionError, Timeout +from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, exception +from .. import text + +try: + from OpenSSL.SSL import Error as SSLError +except ImportError: + from ssl import SSLError class HttpDownloader(DownloaderBase): @@ -20,13 +26,16 @@ class HttpDownloader(DownloaderBase): def __init__(self, extractor, output): DownloaderBase.__init__(self, extractor, output) - self.response = None self.retries = self.config("retries", extractor._retries) self.timeout = self.config("timeout", extractor._timeout) self.verify = self.config("verify", extractor._verify) + self.mtime = self.config("mtime", True) self.rate = self.config("rate") + self.downloading = False self.chunk_size = 16384 + if self.retries < 0: + self.retries = float("inf") if self.rate: self.rate = text.parse_bytes(self.rate) if not self.rate: @@ -34,41 +43,132 @@ class HttpDownloader(DownloaderBase): elif self.rate < self.chunk_size: self.chunk_size = self.rate - def connect(self, url, offset): - headers = {} - if offset: - headers["Range"] = "bytes={}-".format(offset) - + def download(self, url, pathfmt): try: - self.response = self.session.request( - "GET", url, stream=True, headers=headers, allow_redirects=True, - timeout=self.timeout, verify=self.verify) - except (ConnectionError, Timeout) as exc: - raise exception.DownloadRetry(exc) - - code = self.response.status_code - if code == 200: # OK - offset = 0 - size = self.response.headers.get("Content-Length") - elif code == 206: # Partial Content - size = self.response.headers["Content-Range"].rpartition("/")[2] - elif code == 416: # Requested Range Not Satisfiable - raise exception.DownloadComplete() - elif code == 429 or 500 <= code < 600: # Server Error - raise exception.DownloadRetry( - "{} Server Error: {} for url: {}".format( - code, self.response.reason, url)) - else: - self.response.raise_for_status() - - return offset, text.parse_int(size) - - def receive(self, file): + return self._download_impl(url, pathfmt) + except Exception: + print() + raise + finally: + # remove file from incomplete downloads + if self.downloading and not self.part: + try: + os.unlink(pathfmt.temppath) + except (OSError, AttributeError): + pass + + def _download_impl(self, url, pathfmt): + response = None + adj_ext = None + tries = 0 + msg = "" + + if self.part: + pathfmt.part_enable(self.partdir) + + while True: + if tries: + if response: + response.close() + self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) + if tries > self.retries: + return False + time.sleep(min(2 ** (tries-1), 1800)) + tries += 1 + + # check for .part file + filesize = pathfmt.part_size() + if filesize: + headers = {"Range": "bytes={}-".format(filesize)} + else: + headers = None + + # connect to (remote) source + try: + response = self.session.request( + "GET", url, stream=True, headers=headers, + timeout=self.timeout, verify=self.verify) + except (ConnectionError, Timeout) as exc: + msg = str(exc) + continue + except Exception as exc: + self.log.warning("%s", exc) + return False + + # check response + code = response.status_code + if code == 200: # OK + offset = 0 + size = response.headers.get("Content-Length") + elif code == 206: # Partial Content + offset = filesize + size = response.headers["Content-Range"].rpartition("/")[2] + elif code == 416: # Requested Range Not Satisfiable + break + else: + msg = "{}: {} for url: {}".format(code, response.reason, url) + if code == 429 or 500 <= code < 600: # Server Error + continue + self.log.warning("%s", msg) + return False + size = text.parse_int(size) + + # set missing filename extension + if not pathfmt.has_extension: + pathfmt.set_extension(self.get_extension(response)) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + + # set open mode + if not offset: + mode = "w+b" + if filesize: + self.log.info("Unable to resume partial download") + else: + mode = "r+b" + self.log.info("Resuming download at byte %d", offset) + + # start downloading + self.out.start(pathfmt.path) + self.downloading = True + with pathfmt.open(mode) as file: + if offset: + file.seek(offset) + + # download content + try: + self.receive(response, file) + except (RequestException, SSLError) as exc: + msg = str(exc) + print() + continue + + # check filesize + if size and file.tell() < size: + msg = "filesize mismatch ({} < {})".format( + file.tell(), size) + print() + continue + + # check filename extension + adj_ext = self.check_extension(file, pathfmt) + + break + + self.downloading = False + if adj_ext: + pathfmt.set_extension(adj_ext) + if self.mtime: + pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified") + return True + + def receive(self, response, file): if self.rate: total = 0 # total amount of bytes received start = time.time() # start time - for data in self.response.iter_content(self.chunk_size): + for data in response.iter_content(self.chunk_size): file.write(data) if self.rate: @@ -79,13 +179,8 @@ class HttpDownloader(DownloaderBase): # sleep if less time passed than expected time.sleep(expected - delta) - def reset(self): - if self.response: - self.response.close() - self.response = None - - def get_extension(self): - mtype = self.response.headers.get("Content-Type", "image/jpeg") + def get_extension(self, response): + mtype = response.headers.get("Content-Type", "image/jpeg") mtype = mtype.partition(";")[0] if mtype in MIMETYPE_MAP: @@ -100,6 +195,26 @@ class HttpDownloader(DownloaderBase): "No filename extension found for MIME type '%s'", mtype) return "txt" + @staticmethod + def check_extension(file, pathfmt): + """Check filename extension against fileheader""" + extension = pathfmt.keywords["extension"] + if extension in FILETYPE_CHECK: + file.seek(0) + header = file.read(8) + if len(header) >= 8 and not FILETYPE_CHECK[extension](header): + for ext, check in FILETYPE_CHECK.items(): + if ext != extension and check(header): + return ext + return None + + +FILETYPE_CHECK = { + "jpg": lambda h: h[0:2] == b"\xff\xd8", + "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", + "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, +} + MIMETYPE_MAP = { "image/jpeg": "jpg", |
