5 files changed, 455 insertions, 0 deletions
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
new file mode 100644
index 0000000..97972cd
--- /dev/null
+++ b/gallery_dl/downloader/__init__.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader modules"""
+
+import importlib
+
+modules = [  # importable downloader modules, named after their scheme
+    "http",
+    "text",
+    "ytdl",
+]
+
+
+def find(scheme):
+    """Look up (and cache) the downloader class handling the given scheme"""
+    if scheme in _cache:
+        return _cache[scheme]
+
+    downloader = None
+    try:
+        if scheme in modules:  # only import known downloader modules
+            submodule = importlib.import_module("." + scheme, __package__)
+            downloader = submodule.__downloader__
+    except (ImportError, AttributeError, TypeError):
+        pass  # module unusable: None is cached as "no downloader"
+    _cache[scheme] = downloader
+    return downloader
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = {}  # scheme -> downloader class, or None if none could be loaded
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
new file mode 100644
index 0000000..4803c85
--- /dev/null
+++ b/gallery_dl/downloader/common.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by downloader modules."""
+
+import os
+import time
+import logging
+from .. import config, util, exception
+from requests.exceptions import RequestException
+from ssl import SSLError
+
+
+class DownloaderBase():
+    """Base class implementing the download/retry loop for downloaders"""
+    scheme = ""  # URL scheme; also names the config section and logger
+    retries = 1  # maximum number of download attempts
+
+    def __init__(self, extractor, output):
+        self.session = extractor.session
+        self.out = output
+        self.log = logging.getLogger("downloader." + self.scheme)
+        self.downloading = False  # set once writing to a file has started
+        self.part = self.config("part", True)
+        self.partdir = self.config("part-directory")
+        if self.partdir:
+            self.partdir = util.expand_path(self.partdir)
+            os.makedirs(self.partdir, exist_ok=True)
+
+    def config(self, key, default=None):
+        """Interpolate config value for 'key'"""
+        return config.interpolate(("downloader", self.scheme, key), default)
+
+    def download(self, url, pathfmt):
+        """Download 'url' to the file given by 'pathfmt'; return success"""
+        try:
+            return self.download_impl(url, pathfmt)
+        except Exception:
+            print()
+            raise
+        finally:
+            # remove file from incomplete downloads
+            if self.downloading and not self.part:
+                try:
+                    os.remove(pathfmt.temppath)
+                except (OSError, AttributeError):
+                    pass
+            self.downloading = False  # do not carry over into the next call
+
+    def download_impl(self, url, pathfmt):
+        """Actual implementation of the download process"""
+        adj_ext = None
+        tries = 0
+        msg = ""
+
+        if self.part:
+            pathfmt.part_enable(self.partdir)
+
+        while True:
+            self.reset()
+            if tries:
+                self.log.warning("%s (%d/%d)", msg, tries, self.retries)
+                if tries >= self.retries:
+                    return False
+                time.sleep(tries)  # wait 'tries' seconds before retrying
+            tries += 1
+
+            # check for .part file
+            filesize = pathfmt.part_size()
+
+            # connect to (remote) source
+            try:
+                offset, size = self.connect(url, filesize)
+            except exception.DownloadRetry as exc:
+                msg = exc
+                continue
+            except exception.DownloadComplete:
+                break
+            except Exception as exc:
+                self.log.warning(exc)
+                return False
+
+            # check response
+            if not offset:
+                mode = "w+b"
+                if filesize:
+                    self.log.info("Unable to resume partial download")
+            else:
+                mode = "r+b"
+                self.log.info("Resuming download at byte %d", offset)
+
+            # set missing filename extension
+            if not pathfmt.has_extension:
+                pathfmt.set_extension(self.get_extension())
+                if pathfmt.exists():
+                    pathfmt.temppath = ""
+                    return True
+
+            self.out.start(pathfmt.path)
+            self.downloading = True
+            with pathfmt.open(mode) as file:
+                if offset:
+                    file.seek(offset)
+
+                # download content
+                try:
+                    self.receive(file)
+                except (RequestException, SSLError) as exc:
+                    msg = exc
+                    print()
+                    continue
+
+                # check filesize
+                if size and file.tell() < size:
+                    msg = "filesize mismatch ({} < {})".format(
+                        file.tell(), size)
+                    continue
+
+                # check filename extension
+                adj_ext = self._check_extension(file, pathfmt)
+
+            break
+
+        self.downloading = False
+        if adj_ext:
+            pathfmt.set_extension(adj_ext)
+        return True
+
+    def connect(self, url, offset):
+        """Connect to 'url' while respecting 'offset' if possible
+
+        Returns a 2-tuple containing the actual offset and expected filesize.
+        If the returned offset-value is greater than zero, all received data
+        will be appended to the existing .part file.
+        Return '0' as second tuple-field to indicate an unknown filesize.
+        """
+
+    def receive(self, file):
+        """Write data to 'file'"""
+
+    def reset(self):
+        """Reset internal state / cleanup"""
+
+    def get_extension(self):
+        """Return a filename extension appropriate for the current request"""
+
+    @staticmethod
+    def _check_extension(file, pathfmt):
+        """Check filename extension against fileheader"""
+        extension = pathfmt.keywords["extension"]
+        if extension in FILETYPE_CHECK:
+            file.seek(0)
+            header = file.read(8)
+            if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
+                for ext, check in FILETYPE_CHECK.items():
+                    if ext != extension and check(header):
+                        return ext
+        return None
+
+
+FILETYPE_CHECK = {  # extension -> predicate over the first file bytes
+    "jpg": lambda h: h.startswith(b"\xff\xd8"),
+    "png": lambda h: h.startswith(b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a"),
+    "gif": lambda h: h.startswith(b"GIF8") and h[5] == ord("a"),
+}
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
new file mode 100644
index 0000000..961c1a2
--- /dev/null
+++ b/gallery_dl/downloader/http.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for http:// and https:// URLs"""
+
+import time
+import mimetypes
+from requests.exceptions import ConnectionError, HTTPError, Timeout
+from .common import DownloaderBase
+from .. import text, exception
+
+
+class HttpDownloader(DownloaderBase):
+    """Downloader for http:// and https:// URLs"""
+    scheme = "http"
+
+    def __init__(self, extractor, output):
+        DownloaderBase.__init__(self, extractor, output)
+        self.response = None
+        self.retries = self.config("retries", extractor._retries)
+        self.timeout = self.config("timeout", extractor._timeout)
+        self.verify = self.config("verify", extractor._verify)
+        self.rate = self.config("rate")
+        self.chunk_size = 16384
+
+        if self.rate:
+            self.rate = text.parse_bytes(self.rate)
+            if not self.rate:
+                self.log.warning("Invalid rate limit specified")
+            elif self.rate < self.chunk_size:
+                self.chunk_size = self.rate
+
+    def connect(self, url, offset):
+        """Send a GET request for 'url', resuming at 'offset' if possible"""
+        headers = {"Range": "bytes={}-".format(offset)} if offset else {}
+        try:
+            self.response = self.session.request(
+                "GET", url, stream=True, headers=headers, allow_redirects=True,
+                timeout=self.timeout, verify=self.verify)
+        except (ConnectionError, Timeout) as exc:
+            raise exception.DownloadRetry(exc)
+
+        code = self.response.status_code
+        if code == 200:  # OK
+            offset = 0
+            size = self.response.headers.get("Content-Length")
+        elif code == 206:  # Partial Content
+            size = self.response.headers["Content-Range"].rpartition("/")[2]
+        elif code == 416:  # Requested Range Not Satisfiable
+            raise exception.DownloadComplete()
+        elif code == 429 or 500 <= code < 600:  # Server Error
+            raise exception.DownloadRetry(
+                "{} Server Error: {} for url: {}".format(
+                    code, self.response.reason, url))
+        else:
+            self.response.raise_for_status()  # remaining 4xx errors
+            # any other status has no defined size: fail with a clear message
+            raise HTTPError("Unexpected {} {} for url: {}".format(
+                code, self.response.reason, url), response=self.response)
+
+        return offset, text.parse_int(size)
+
+    def receive(self, file):
+        if self.rate:
+            total = 0            # total amount of bytes received
+            start = time.time()  # start time
+
+        for data in self.response.iter_content(self.chunk_size):
+            file.write(data)
+            if self.rate:
+                total += len(data)
+                expected = total / self.rate  # expected elapsed time
+                delta = time.time() - start   # actual elapsed time since start
+                if delta < expected:
+                    # sleep if less time passed than expected
+                    time.sleep(expected - delta)
+
+    def reset(self):
+        if self.response is not None:  # Response is falsy for 4xx/5xx codes
+            self.response.close()
+        self.response = None
+
+    def get_extension(self):
+        mtype = self.response.headers.get("Content-Type", "image/jpeg")
+        mtype = mtype.partition(";")[0]
+
+        if mtype in MIMETYPE_MAP:
+            return MIMETYPE_MAP[mtype]
+
+        exts = mimetypes.guess_all_extensions(mtype, strict=False)
+        if exts:
+            return max(exts)[1:]  # last in sort order, leading dot removed
+
+        self.log.warning(
+            "No filename extension found for MIME type '%s'", mtype)
+        return "txt"
+
+
+MIMETYPE_MAP = {  # MIME type -> filename extension (without leading dot)
+    "image/jpeg": "jpg",
+    "image/jpg": "jpg",
+    "image/png": "png",
+    "image/gif": "gif",
+    "image/bmp": "bmp",
+    "image/webp": "webp",
+    "image/svg+xml": "svg",
+
+    "video/webm": "webm",
+    "video/ogg": "ogg",
+    "video/mp4": "mp4",
+
+    "audio/wav": "wav",
+    "audio/x-wav": "wav",
+    "audio/webm": "webm",
+    "audio/ogg": "ogg",
+    "audio/mpeg": "mp3",
+
+    "application/ogg": "ogg",
+    "application/octet-stream": "bin",
+}
+
+
+__downloader__ = HttpDownloader  # class returned by downloader.find()
diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py
new file mode 100644
index 0000000..ca33863
--- /dev/null
+++ b/gallery_dl/downloader/text.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for text: URLs"""
+
+from .common import DownloaderBase
+
+
+class TextDownloader(DownloaderBase):  # serves the text of "text:" URLs
+    scheme = "text"
+
+    def __init__(self, extractor, output):
+        super().__init__(extractor, output)
+        self.content = b""
+
+    def connect(self, url, offset):
+        encoded = url.encode()
+        self.content = encoded[5 + offset:]  # 5 == len("text:")
+        return offset, len(encoded) - 5
+
+    def receive(self, file):
+        file.write(self.content)
+
+    def reset(self):
+        self.content = b""
+
+    @staticmethod
+    def get_extension():
+        return "txt"
+
+
+__downloader__ = TextDownloader  # class returned by downloader.find()
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
new file mode 100644
index 0000000..57a84d0
--- /dev/null
+++ b/gallery_dl/downloader/ytdl.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for URLs requiring youtube-dl support"""
+
+from youtube_dl import YoutubeDL
+from .common import DownloaderBase
+from .. import text
+import os
+
+
+class YoutubeDLDownloader(DownloaderBase):
+    """Downloader delegating 'ytdl:' URLs to youtube-dl"""
+    scheme = "ytdl"
+
+    def __init__(self, extractor, output):
+        DownloaderBase.__init__(self, extractor, output)
+        options = {
+            "format": self.config("format") or None,
+            "ratelimit": text.parse_bytes(self.config("rate"), None),
+            "retries": self.config("retries", extractor._retries),
+            "socket_timeout": self.config("timeout", extractor._timeout),
+            "nocheckcertificate": not self.config("verify", extractor._verify),
+            "nopart": not self.part,
+        }
+        options.update(self.config("raw-options") or {})
+        if self.config("logging", True):
+            options["logger"] = self.log
+        self.ytdl = YoutubeDL(options)
+
+    def download(self, url, pathfmt):
+        """Download the video or playlist referenced by a 'ytdl:' URL"""
+        try:
+            info_dict = self.ytdl.extract_info(url[5:], download=False)
+        except Exception:
+            self.log.debug("Traceback", exc_info=True)
+            return False
+        if not info_dict:  # may be None, e.g. with 'ignoreerrors' enabled
+            return False
+
+        if "entries" in info_dict:
+            index = pathfmt.keywords.get("_ytdl_index")
+            if index is None:
+                return self._download_playlist(pathfmt, info_dict)
+            info_dict = info_dict["entries"][index]
+        return self._download_video(pathfmt, info_dict)
+
+    def _download_video(self, pathfmt, info_dict):
+        if "url" in info_dict:
+            text.nameext_from_url(info_dict["url"], pathfmt.keywords)
+        pathfmt.set_extension(info_dict["ext"])
+        if pathfmt.exists():
+            pathfmt.temppath = ""
+            return True
+        if self.part and self.partdir:
+            pathfmt.temppath = os.path.join(self.partdir, pathfmt.filename)
+        self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")
+
+        self.out.start(pathfmt.path)
+        try:
+            self.ytdl.process_info(info_dict)
+        except Exception:
+            self.log.debug("Traceback", exc_info=True)
+            return False
+        return True
+
+    def _download_playlist(self, pathfmt, info_dict):
+        pathfmt.set_extension("%(playlist_index)s.%(ext)s")
+        self.ytdl.params["outtmpl"] = pathfmt.realpath
+
+        for entry in info_dict["entries"]:
+            self.ytdl.process_info(entry)
+        return True
+
+
+__downloader__ = YoutubeDLDownloader  # class returned by downloader.find()