diff options
Diffstat (limited to 'gallery_dl/downloader')
| -rw-r--r-- | gallery_dl/downloader/__init__.py | 39 | ||||
| -rw-r--r-- | gallery_dl/downloader/common.py | 170 | ||||
| -rw-r--r-- | gallery_dl/downloader/http.py | 128 | ||||
| -rw-r--r-- | gallery_dl/downloader/text.py | 37 | ||||
| -rw-r--r-- | gallery_dl/downloader/ytdl.py | 81 |
5 files changed, 455 insertions, 0 deletions
# diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
# new file mode 100644
# index 0000000..97972cd
# --- /dev/null
# +++ b/gallery_dl/downloader/__init__.py
# @@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-

# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Downloader modules"""

import importlib

# Names of the submodules that 'find()' is allowed to import.
modules = [
    "http",
    "text",
    "ytdl",
]


def find(scheme):
    """Return downloader class suitable for handling the given scheme

    The result of every lookup is stored in '_cache', including 'None'
    for schemes without a usable downloader, so the import is attempted
    at most once per scheme.
    """
    try:
        return _cache[scheme]
    except KeyError:
        klass = None
        try:
            if scheme in modules:  # prevent unwanted imports
                module = importlib.import_module("." + scheme, __package__)
                klass = module.__downloader__
        except (ImportError, AttributeError, TypeError):
            # module not importable (e.g. missing dependency) or it does
            # not export '__downloader__': treat the scheme as unsupported
            pass
        _cache[scheme] = klass
        return klass


# --------------------------------------------------------------------
# internals

# Maps scheme -> downloader class (or None when lookup failed).
_cache = {}


# diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
# new file mode 100644
# index 0000000..4803c85
# --- /dev/null
# +++ b/gallery_dl/downloader/common.py
# @@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-

# Copyright 2014-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Common classes and constants used by downloader modules."""

import os
import time
import logging
from .. import config, util, exception
from requests.exceptions import RequestException
from ssl import SSLError


class DownloaderBase():
    """Base class for downloaders

    Subclasses set 'scheme' and override 'connect()', 'receive()',
    'reset()' and 'get_extension()'; the retry loop, .part-file handling
    and filename-extension checks are implemented here.
    """
    scheme = ""
    retries = 1

    def __init__(self, extractor, output):
        """Store session/output objects and read the 'part' options

        Creates the configured 'part-directory' if one is set.
        """
        self.session = extractor.session
        self.out = output
        self.log = logging.getLogger("downloader." + self.scheme)
        # True while a file is being written; checked by 'download()'
        # to decide whether an incomplete file has to be removed
        self.downloading = False
        self.part = self.config("part", True)
        self.partdir = self.config("part-directory")

        if self.partdir:
            self.partdir = util.expand_path(self.partdir)
            os.makedirs(self.partdir, exist_ok=True)

    def config(self, key, default=None):
        """Interpolate config value for 'key'"""
        return config.interpolate(("downloader", self.scheme, key), default)

    def download(self, url, pathfmt):
        """Download the resource at 'url' and write it to a file-like object

        Returns the result of 'download_impl()'. Exceptions raised by it
        are re-raised after printing a newline. If a download was still in
        progress and .part files are disabled, the incomplete file at
        'pathfmt.temppath' is removed.
        """
        try:
            return self.download_impl(url, pathfmt)
        except Exception:
            print()
            raise
        finally:
            # remove file from incomplete downloads
            if self.downloading and not self.part:
                try:
                    os.remove(pathfmt.temppath)
                except (OSError, AttributeError):
                    pass

    def download_impl(self, url, pathfmt):
        """Actual implementation of the download process

        Returns True if the file was downloaded (or already exists) and
        False if the download failed or the retry limit was reached.
        """
        adj_ext = None
        tries = 0
        msg = ""

        # A previous call that ran out of retries leaves this flag set;
        # clear it so the cleanup in 'download()' only ever refers to a
        # file opened during the current call.
        self.downloading = False

        if self.part:
            pathfmt.part_enable(self.partdir)

        while True:
            self.reset()
            if tries:
                self.log.warning("%s (%d/%d)", msg, tries, self.retries)
                if tries >= self.retries:
                    return False
                # wait longer after each failed attempt
                time.sleep(tries)
            tries += 1

            # check for .part file
            filesize = pathfmt.part_size()

            # connect to (remote) source
            try:
                offset, size = self.connect(url, filesize)
            except exception.DownloadRetry as exc:
                msg = exc
                continue
            except exception.DownloadComplete:
                break
            except Exception as exc:
                self.log.warning(exc)
                return False

            # check response
            if not offset:
                mode = "w+b"
                if filesize:
                    self.log.info("Unable to resume partial download")
            else:
                mode = "r+b"
                self.log.info("Resuming download at byte %d", offset)

            # set missing filename extension
            if not pathfmt.has_extension:
                pathfmt.set_extension(self.get_extension())
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    return True

            self.out.start(pathfmt.path)
            self.downloading = True
            with pathfmt.open(mode) as file:
                if offset:
                    file.seek(offset)

                # download content
                try:
                    self.receive(file)
                except (RequestException, SSLError) as exc:
                    msg = exc
                    print()
                    continue

                # check filesize
                if size and file.tell() < size:
                    msg = "filesize mismatch ({} < {})".format(
                        file.tell(), size)
                    continue

                # check filename extension
                adj_ext = self._check_extension(file, pathfmt)

            break

        self.downloading = False
        if adj_ext:
            pathfmt.set_extension(adj_ext)
        return True

    def connect(self, url, offset):
        """Connect to 'url' while respecting 'offset' if possible

        Returns a 2-tuple containing the actual offset and expected filesize.
        If the returned offset-value is greater than zero, all received data
        will be appended to the existing .part file.
        Return '0' as second tuple-field to indicate an unknown filesize.
        """

    def receive(self, file):
        """Write data to 'file'"""

    def reset(self):
        """Reset internal state / cleanup"""

    def get_extension(self):
        """Return a filename extension appropriate for the current request"""

    @staticmethod
    def _check_extension(file, pathfmt):
        """Check filename extension against fileheader

        Reads the first 8 bytes of 'file'. If the current extension has an
        entry in FILETYPE_CHECK and that check rejects the header, return
        the first other extension whose check accepts it; otherwise None.
        """
        extension = pathfmt.keywords["extension"]
        if extension in FILETYPE_CHECK:
            file.seek(0)
            header = file.read(8)
            if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
                for ext, check in FILETYPE_CHECK.items():
                    if ext != extension and check(header):
                        return ext
        return None


# Maps filename extension -> predicate over the first 8 bytes of a file.
FILETYPE_CHECK = {
    "jpg": lambda h: h[0:2] == b"\xff\xd8",
    "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
    # 97 == ord("a"): sixth byte of a "GIF8?a" signature
    "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
}


# diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
# new file mode 100644
# index 0000000..961c1a2
# --- /dev/null
# +++ b/gallery_dl/downloader/http.py
# @@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-

# Copyright 2014-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the
# Free Software Foundation.

"""Downloader module for http:// and https:// URLs"""

import time
import mimetypes
from requests.exceptions import ConnectionError, Timeout, HTTPError
from .common import DownloaderBase
from .. import text, exception


class HttpDownloader(DownloaderBase):
    """Downloader for http:// and https:// URLs

    Supports resuming via HTTP Range requests and an optional rate limit.
    """
    scheme = "http"

    def __init__(self, extractor, output):
        """Read 'retries', 'timeout', 'verify' and 'rate' options

        The extractor's '_retries', '_timeout' and '_verify' values serve
        as defaults for the first three options.
        """
        DownloaderBase.__init__(self, extractor, output)
        self.response = None
        self.retries = self.config("retries", extractor._retries)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.rate = self.config("rate")
        self.chunk_size = 16384

        if self.rate:
            self.rate = text.parse_bytes(self.rate)
            if not self.rate:
                self.log.warning("Invalid rate limit specified")
            elif self.rate < self.chunk_size:
                # never request more per chunk than the limit allows
                self.chunk_size = self.rate

    def connect(self, url, offset):
        """Send a GET request for 'url', resuming at 'offset' if non-zero

        Returns an (offset, size) tuple; the offset is 0 when the server
        answered with the complete resource (200).

        Raises DownloadRetry on connection errors, timeouts, 429 and 5xx
        responses, DownloadComplete on 416, and requests' HTTPError for
        any other status code that is not 200 or 206.
        """
        headers = {}
        if offset:
            headers["Range"] = "bytes={}-".format(offset)

        try:
            self.response = self.session.request(
                "GET", url, stream=True, headers=headers, allow_redirects=True,
                timeout=self.timeout, verify=self.verify)
        except (ConnectionError, Timeout) as exc:
            raise exception.DownloadRetry(exc)

        code = self.response.status_code
        if code == 200:  # OK
            offset = 0
            size = self.response.headers.get("Content-Length")
        elif code == 206:  # Partial Content
            # total size is the part after "/" in "bytes a-b/total";
            # a missing header is passed on as None, like a missing
            # Content-Length above, instead of raising KeyError
            content_range = self.response.headers.get("Content-Range")
            size = content_range.rpartition("/")[2] if content_range else None
        elif code == 416:  # Requested Range Not Satisfiable
            raise exception.DownloadComplete()
        elif code == 429 or 500 <= code < 600:  # Server Error
            raise exception.DownloadRetry(
                "{} Server Error: {} for url: {}".format(
                    code, self.response.reason, url))
        else:
            # raises HTTPError for remaining 4xx codes ...
            self.response.raise_for_status()
            # ... but is a no-op for everything else (1xx, other 2xx, 3xx).
            # Fail explicitly instead of hitting an unbound 'size' below.
            raise HTTPError(
                "Unexpected HTTP status {} ({}) for url: {}".format(
                    code, self.response.reason, url),
                response=self.response)

        return offset, text.parse_int(size)

    def receive(self, file):
        """Write the response body to 'file', honoring the rate limit"""
        if self.rate:
            total = 0    # total amount of bytes received
            start = time.time()  # start time

        for data in self.response.iter_content(self.chunk_size):
            file.write(data)

            if self.rate:
                total += len(data)
                expected = total / self.rate  # expected elapsed time
                delta = time.time() - start   # actual elapsed time since start
                if delta < expected:
                    # sleep if less time passed than expected
                    time.sleep(expected - delta)

    def reset(self):
        """Close and discard the current response, if any"""
        if self.response:
            self.response.close()
            self.response = None

    def get_extension(self):
        """Return a filename extension based on the Content-Type header

        Falls back to "image/jpeg" when the header is missing and to "txt"
        (with a warning) when no extension is known for the MIME type.
        """
        mtype = self.response.headers.get("Content-Type", "image/jpeg")
        # drop parameters such as "; charset=..." and normalize, since
        # MIME type names are case-insensitive
        mtype = mtype.partition(";")[0].strip().lower()

        if mtype in MIMETYPE_MAP:
            return MIMETYPE_MAP[mtype]

        exts = mimetypes.guess_all_extensions(mtype, strict=False)
        if exts:
            # pick deterministically: last entry in sorted order,
            # without its leading dot
            exts.sort()
            return exts[-1][1:]

        self.log.warning(
            "No filename extension found for MIME type '%s'", mtype)
        return "txt"


# Maps MIME type -> preferred filename extension.
MIMETYPE_MAP = {
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/webp": "webp",
    "image/svg+xml": "svg",

    "video/webm": "webm",
    "video/ogg": "ogg",
    "video/mp4": "mp4",

    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/webm": "webm",
    "audio/ogg": "ogg",
    "audio/mpeg": "mp3",

    "application/ogg": "ogg",
    "application/octet-stream": "bin",
}


__downloader__ = HttpDownloader


# diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py
# new file mode 100644
# index 0000000..ca33863
# --- /dev/null
# +++ b/gallery_dl/downloader/text.py
# @@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-

# Copyright 2014-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Downloader module for text: URLs"""

from .common import DownloaderBase


class TextDownloader(DownloaderBase):
    """Downloader that writes the text embedded in a 'text:' URL"""
    scheme = "text"

    def __init__(self, extractor, output):
        DownloaderBase.__init__(self, extractor, output)
        self.content = b""

    def connect(self, url, offset):
        """Encode 'url' and keep everything after the first 5 + offset bytes

        The 5 skipped bytes presumably correspond to the "text:" prefix.
        Returns 'offset' unchanged and the payload length without prefix.
        """
        data = url.encode()
        self.content = data[offset + 5:]
        return offset, len(data) - 5

    def receive(self, file):
        """Write the stored content to 'file'"""
        file.write(self.content)

    def reset(self):
        """Discard the stored content"""
        self.content = b""

    @staticmethod
    def get_extension():
        """Return "txt" as extension for all text: URLs"""
        return "txt"


__downloader__ = TextDownloader


# diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
# new file mode 100644
# index 0000000..57a84d0
# --- /dev/null
# +++ b/gallery_dl/downloader/ytdl.py
# @@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-

# Copyright 2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Downloader module for URLs requiring youtube-dl support"""

from youtube_dl import YoutubeDL
from .common import DownloaderBase
from .. import text
import os


class YoutubeDLDownloader(DownloaderBase):
    """Downloader that delegates the actual work to youtube-dl"""
    scheme = "ytdl"

    def __init__(self, extractor, output):
        """Build a YoutubeDL instance from the downloader options

        Entries of the 'raw-options' setting override the options derived
        from the other settings.
        """
        DownloaderBase.__init__(self, extractor, output)

        options = {
            "format": self.config("format") or None,
            "ratelimit": text.parse_bytes(self.config("rate"), None),
            "retries": self.config("retries", extractor._retries),
            "socket_timeout": self.config("timeout", extractor._timeout),
            "nocheckcertificate": not self.config("verify", extractor._verify),
            "nopart": not self.part,
        }
        options.update(self.config("raw-options") or {})

        if self.config("logging", True):
            options["logger"] = self.log

        self.ytdl = YoutubeDL(options)

    def download(self, url, pathfmt):
        """Extract info for 'url' and download the video or playlist

        The first 5 characters of 'url' (presumably the "ytdl:" prefix)
        are stripped before it is handed to youtube-dl.
        Returns True on success and False on failure.
        """
        try:
            info_dict = self.ytdl.extract_info(url[5:], download=False)
        except Exception:
            # same reporting as for failures in '_download_video()'
            self.log.debug("Traceback", exc_info=True)
            return False

        # NOTE(review): extract_info() may return None instead of raising
        # (e.g. with 'ignoreerrors' in raw-options) - confirm against
        # youtube-dl; without this guard the 'in' test raises TypeError.
        if not info_dict:
            return False

        if "entries" in info_dict:
            index = pathfmt.keywords.get("_ytdl_index")
            if index is None:
                return self._download_playlist(pathfmt, info_dict)
            else:
                info_dict = info_dict["entries"][index]
        return self._download_video(pathfmt, info_dict)

    def _download_video(self, pathfmt, info_dict):
        """Download the single video described by 'info_dict'"""
        if "url" in info_dict:
            text.nameext_from_url(info_dict["url"], pathfmt.keywords)
        pathfmt.set_extension(info_dict["ext"])
        if pathfmt.exists():
            pathfmt.temppath = ""
            return True
        if self.part and self.partdir:
            pathfmt.temppath = os.path.join(
                self.partdir, pathfmt.filename)
        # escape '%' so the path is not interpreted as an output template
        self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")

        self.out.start(pathfmt.path)
        try:
            self.ytdl.process_info(info_dict)
        except Exception:
            self.log.debug("Traceback", exc_info=True)
            return False
        return True

    def _download_playlist(self, pathfmt, info_dict):
        """Download all entries of the playlist described by 'info_dict'

        Returns False as soon as one entry fails, mirroring the error
        handling of '_download_video()'.
        """
        # let youtube-dl fill in index and extension for every entry
        pathfmt.set_extension("%(playlist_index)s.%(ext)s")
        self.ytdl.params["outtmpl"] = pathfmt.realpath

        try:
            for entry in info_dict["entries"]:
                self.ytdl.process_info(entry)
        except Exception:
            self.log.debug("Traceback", exc_info=True)
            return False
        return True


__downloader__ = YoutubeDLDownloader
