summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/downloader
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/downloader')
-rw-r--r-- gallery_dl/downloader/__init__.py 39
-rw-r--r-- gallery_dl/downloader/common.py 170
-rw-r--r-- gallery_dl/downloader/http.py 128
-rw-r--r-- gallery_dl/downloader/text.py 37
-rw-r--r-- gallery_dl/downloader/ytdl.py 81
5 files changed, 455 insertions, 0 deletions
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
new file mode 100644
index 0000000..97972cd
--- /dev/null
+++ b/gallery_dl/downloader/__init__.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader modules"""
+
+import importlib
+
# Names of the downloader submodules that find() is allowed to import.
modules = ["http", "text", "ytdl"]


def find(scheme):
    """Look up the downloader class responsible for 'scheme'

    Returns the '__downloader__' object of the matching submodule, or
    None if 'scheme' is not listed in 'modules' or the submodule could
    not be imported. Every result - including None - is stored in
    '_cache', so each scheme causes at most one import attempt.
    """
    if scheme in _cache:
        return _cache[scheme]

    downloader = None
    try:
        # only names listed in 'modules' may be imported
        if scheme in modules:
            downloader = importlib.import_module(
                "." + scheme, __package__).__downloader__
    except (ImportError, AttributeError, TypeError):
        downloader = None

    _cache[scheme] = downloader
    return downloader


# --------------------------------------------------------------------
# internals

# scheme -> downloader class, or None for schemes without a downloader
_cache = {}
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
new file mode 100644
index 0000000..4803c85
--- /dev/null
+++ b/gallery_dl/downloader/common.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by downloader modules."""
+
+import os
+import time
+import logging
+from .. import config, util, exception
+from requests.exceptions import RequestException
+from ssl import SSLError
+
+
class DownloaderBase():
    """Base class for downloaders

    Subclasses set 'scheme' and override connect(), receive(), reset()
    and get_extension(); download() drives them via download_impl().
    """
    scheme = ""
    retries = 1

    def __init__(self, extractor, output):
        self.session = extractor.session
        self.out = output
        self.log = logging.getLogger("downloader." + self.scheme)
        # True from the moment a target file is opened for writing until
        # the download finished successfully
        self.downloading = False
        self.part = self.config("part", True)
        self.partdir = self.config("part-directory")

        if self.partdir:
            self.partdir = util.expand_path(self.partdir)
            os.makedirs(self.partdir, exist_ok=True)

    def config(self, key, default=None):
        """Interpolate config value for 'key'"""
        return config.interpolate(("downloader", self.scheme, key), default)

    def download(self, url, pathfmt):
        """Download the resource at 'url' to the path described by 'pathfmt'

        Returns whatever download_impl() returns (True on success, False
        on failure). Exceptions raised by download_impl() are re-raised
        after a newline has been printed.
        """
        try:
            return self.download_impl(url, pathfmt)
        except Exception:
            print()
            raise
        finally:
            if self.downloading:
                # Always clear the flag: if it stayed set after a failed
                # download, a later failing download() on this instance
                # would run the cleanup below for a file it never wrote.
                self.downloading = False
                # remove file from incomplete downloads
                if not self.part:
                    try:
                        os.remove(pathfmt.temppath)
                    except (OSError, AttributeError):
                        pass

    def download_impl(self, url, pathfmt):
        """Actual implementation of the download process

        Repeats connect() / receive() until the file is complete or
        'self.retries' attempts have failed.
        Returns True on success (or if the target file already exists)
        and False otherwise.
        """
        adj_ext = None
        tries = 0
        msg = ""

        if self.part:
            pathfmt.part_enable(self.partdir)

        while True:
            self.reset()
            if tries:
                # a previous attempt failed: report it, then either give
                # up or wait 'tries' seconds before trying again
                self.log.warning("%s (%d/%d)", msg, tries, self.retries)
                if tries >= self.retries:
                    return False
                time.sleep(tries)
            tries += 1

            # check for .part file
            filesize = pathfmt.part_size()

            # connect to (remote) source
            try:
                offset, size = self.connect(url, filesize)
            except exception.DownloadRetry as exc:
                msg = exc
                continue
            except exception.DownloadComplete:
                break
            except Exception as exc:
                self.log.warning(exc)
                return False

            # check response
            if not offset:
                # start from scratch and truncate any existing data
                mode = "w+b"
                if filesize:
                    self.log.info("Unable to resume partial download")
            else:
                # keep existing data and continue writing at 'offset'
                mode = "r+b"
                self.log.info("Resuming download at byte %d", offset)

            # set missing filename extension
            if not pathfmt.has_extension:
                pathfmt.set_extension(self.get_extension())
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    return True

            self.out.start(pathfmt.path)
            self.downloading = True
            with pathfmt.open(mode) as file:
                if offset:
                    file.seek(offset)

                # download content
                try:
                    self.receive(file)
                except (RequestException, SSLError) as exc:
                    msg = exc
                    print()
                    continue

                # check filesize
                if size and file.tell() < size:
                    msg = "filesize mismatch ({} < {})".format(
                        file.tell(), size)
                    continue

                # check filename extension
                adj_ext = self._check_extension(file, pathfmt)

            break

        self.downloading = False
        if adj_ext:
            pathfmt.set_extension(adj_ext)
        return True

    def connect(self, url, offset):
        """Connect to 'url' while respecting 'offset' if possible

        Returns a 2-tuple containing the actual offset and expected filesize.
        If the returned offset-value is greater than zero, all received data
        will be appended to the existing .part file.
        Return '0' as second tuple-field to indicate an unknown filesize.
        """

    def receive(self, file):
        """Write data to 'file'"""

    def reset(self):
        """Reset internal state / cleanup"""

    def get_extension(self):
        """Return a filename extension appropriate for the current request"""

    @staticmethod
    def _check_extension(file, pathfmt):
        """Check filename extension against fileheader

        Returns the extension whose signature matches the first 8 bytes
        of 'file' if the current extension has a known signature that
        does not match; returns None in every other case.
        """
        extension = pathfmt.keywords["extension"]
        if extension in FILETYPE_CHECK:
            file.seek(0)
            header = file.read(8)
            if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
                for ext, check in FILETYPE_CHECK.items():
                    if ext != extension and check(header):
                        return ext
        return None
+
+
# Maps a filename extension to a predicate that tests whether a file
# header (at least the first 8 bytes of a file) carries the signature
# of that file type.
FILETYPE_CHECK = {
    # JPEG: starts with the bytes FF D8
    "jpg": lambda h: h.startswith(b"\xff\xd8"),
    # PNG: fixed 8-byte signature
    "png": lambda h: h.startswith(b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a"),
    # GIF: "GIF8", any byte, then "a" (matches "GIF87a" and "GIF89a")
    "gif": lambda h: h.startswith(b"GIF8") and h[5] == ord("a"),
}
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
new file mode 100644
index 0000000..961c1a2
--- /dev/null
+++ b/gallery_dl/downloader/http.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for http:// and https:// URLs"""
+
+import time
+import mimetypes
+from requests.exceptions import ConnectionError, Timeout
+from .common import DownloaderBase
+from .. import text, exception
+
+
class HttpDownloader(DownloaderBase):
    """Downloader for http:// and https:// URLs"""
    scheme = "http"

    def __init__(self, extractor, output):
        DownloaderBase.__init__(self, extractor, output)
        self.response = None
        self.retries = self.config("retries", extractor._retries)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.rate = self.config("rate")
        self.chunk_size = 16384

        if self.rate:
            self.rate = text.parse_bytes(self.rate)
            if not self.rate:
                self.log.warning("Invalid rate limit specified")
            elif self.rate < self.chunk_size:
                # never read more per chunk than the rate limit allows
                self.chunk_size = self.rate

    def connect(self, url, offset):
        """Send a GET request for 'url', resuming at 'offset' if non-zero

        Returns an (offset, size) tuple: the offset is 0 when the server
        sends the complete file (200) and the requested offset when it
        sends partial content (206).

        Raises exception.DownloadRetry for connection errors, timeouts,
        429 and 5xx responses, exception.DownloadComplete for a 416
        response to a range request, and an error for any other status.
        """
        headers = {}
        if offset:
            headers["Range"] = "bytes={}-".format(offset)

        try:
            self.response = self.session.request(
                "GET", url, stream=True, headers=headers,
                allow_redirects=True,
                timeout=self.timeout, verify=self.verify)
        except (ConnectionError, Timeout) as exc:
            raise exception.DownloadRetry(exc)

        code = self.response.status_code
        if code == 200:  # OK
            offset = 0
            size = self.response.headers.get("Content-Length")
        elif code == 206:  # Partial Content
            size = self.response.headers["Content-Range"].rpartition("/")[2]
        elif code == 416 and offset:  # Requested Range Not Satisfiable
            # Only meaningful as an answer to our own Range header;
            # without one, 416 is an ordinary client error (see below).
            raise exception.DownloadComplete()
        elif code == 429 or 500 <= code < 600:  # Server Error
            raise exception.DownloadRetry(
                "{} Server Error: {} for url: {}".format(
                    code, self.response.reason, url))
        else:
            # raises for 4xx responses ...
            self.response.raise_for_status()
            # ... but not for other statuses (204, 304, ...), for which
            # there is no usable size value either
            raise RuntimeError(
                "Unexpected response: {} {} for url: {}".format(
                    code, self.response.reason, url))

        return offset, text.parse_int(size)

    def receive(self, file):
        """Write the response body to 'file', honoring the rate limit"""
        if self.rate:
            total = 0    # total amount of bytes received
            start = time.time()  # start time

        for data in self.response.iter_content(self.chunk_size):
            file.write(data)

            if self.rate:
                total += len(data)
                expected = total / self.rate  # expected elapsed time
                delta = time.time() - start   # actual elapsed time
                if delta < expected:
                    # sleep if less time passed than expected
                    time.sleep(expected - delta)

    def reset(self):
        """Close and discard the response of the previous attempt"""
        if self.response:
            self.response.close()
            self.response = None

    def get_extension(self):
        """Return a filename extension for the response's Content-Type

        Falls back to "image/jpeg" when the header is missing and to
        "txt" (with a warning) when no extension is known for the type.
        """
        mtype = self.response.headers.get("Content-Type", "image/jpeg")
        # drop parameters such as "; charset=..." and normalize spacing
        # and case, so that e.g. "Image/JPEG ; q=1" is still recognized
        mtype = mtype.partition(";")[0].strip().lower()

        if mtype in MIMETYPE_MAP:
            return MIMETYPE_MAP[mtype]

        exts = mimetypes.guess_all_extensions(mtype, strict=False)
        if exts:
            exts.sort()
            return exts[-1][1:]  # strip the leading "."

        self.log.warning(
            "No filename extension found for MIME type '%s'", mtype)
        return "txt"
+
+
# Filename extensions for MIME types that should not be resolved
# through the 'mimetypes' module.
MIMETYPE_MAP = {
    # images
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/webp": "webp",
    "image/svg+xml": "svg",

    # videos
    "video/webm": "webm",
    "video/ogg": "ogg",
    "video/mp4": "mp4",

    # audio
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/webm": "webm",
    "audio/ogg": "ogg",
    "audio/mpeg": "mp3",

    # generic containers / binary data
    "application/ogg": "ogg",
    "application/octet-stream": "bin",
}


# downloader class provided by this module
__downloader__ = HttpDownloader
diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py
new file mode 100644
index 0000000..ca33863
--- /dev/null
+++ b/gallery_dl/downloader/text.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for text: URLs"""
+
+from .common import DownloaderBase
+
+
class TextDownloader(DownloaderBase):
    """Downloader that writes the text embedded in a text: URL"""
    scheme = "text"

    def __init__(self, extractor, output):
        super().__init__(extractor, output)
        self.content = b""

    def connect(self, url, offset):
        """Store the not yet written part of the encoded URL text

        Returns 'offset' unchanged together with the number of bytes
        following the first 5 bytes of the encoded URL.
        """
        skip = 5  # presumably the length of the "text:" prefix
        encoded = url.encode()
        self.content = encoded[skip + offset:]
        return offset, len(encoded) - skip

    def receive(self, file):
        """Write the stored content to 'file'"""
        file.write(self.content)

    def reset(self):
        """Discard the stored content"""
        self.content = b""

    @staticmethod
    def get_extension():
        """Return the extension used for text files"""
        return "txt"


# downloader class provided by this module
__downloader__ = TextDownloader
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
new file mode 100644
index 0000000..57a84d0
--- /dev/null
+++ b/gallery_dl/downloader/ytdl.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for URLs requiring youtube-dl support"""
+
+from youtube_dl import YoutubeDL
+from .common import DownloaderBase
+from .. import text
+import os
+
+
class YoutubeDLDownloader(DownloaderBase):
    """Downloader that hands 'ytdl:' URLs over to youtube-dl"""
    scheme = "ytdl"

    def __init__(self, extractor, output):
        DownloaderBase.__init__(self, extractor, output)

        options = {
            "format": self.config("format") or None,
            "ratelimit": text.parse_bytes(self.config("rate"), None),
            "retries": self.config("retries", extractor._retries),
            "socket_timeout": self.config("timeout", extractor._timeout),
            "nocheckcertificate": not self.config("verify", extractor._verify),
            "nopart": not self.part,
        }
        # values from "raw-options" override the ones derived above
        options.update(self.config("raw-options") or {})

        if self.config("logging", True):
            options["logger"] = self.log

        self.ytdl = YoutubeDL(options)

    def download(self, url, pathfmt):
        """Download the video or playlist behind 'url'

        The first 5 characters of 'url' (presumably the "ytdl:" prefix)
        are removed before the rest is passed to youtube-dl.
        Returns True on success and False if extracting the metadata or
        downloading a single video failed.
        """
        try:
            info_dict = self.ytdl.extract_info(url[5:], download=False)
        except Exception:
            # same reporting as for failures in _download_video()
            self.log.debug("Traceback", exc_info=True)
            return False

        if not info_dict:
            # youtube-dl may return None instead of raising, e.g. when
            # 'ignoreerrors' is enabled through "raw-options"
            return False

        if "entries" in info_dict:
            index = pathfmt.keywords.get("_ytdl_index")
            if index is None:
                return self._download_playlist(pathfmt, info_dict)
            # a specific playlist entry was requested
            info_dict = info_dict["entries"][index]
        return self._download_video(pathfmt, info_dict)

    def _download_video(self, pathfmt, info_dict):
        """Download the single video described by 'info_dict'"""
        if "url" in info_dict:
            text.nameext_from_url(info_dict["url"], pathfmt.keywords)
        pathfmt.set_extension(info_dict["ext"])
        if pathfmt.exists():
            pathfmt.temppath = ""
            return True
        if self.part and self.partdir:
            pathfmt.temppath = os.path.join(
                self.partdir, pathfmt.filename)
        # 'outtmpl' is a %-style template: escape literal percent signs
        self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")

        self.out.start(pathfmt.path)
        try:
            self.ytdl.process_info(info_dict)
        except Exception:
            self.log.debug("Traceback", exc_info=True)
            return False
        return True

    def _download_playlist(self, pathfmt, info_dict):
        """Download every entry of the playlist described by 'info_dict'"""
        # let youtube-dl fill in index and extension for each entry
        pathfmt.set_extension("%(playlist_index)s.%(ext)s")
        self.ytdl.params["outtmpl"] = pathfmt.realpath

        for entry in info_dict["entries"]:
            self.ytdl.process_info(entry)
        return True


# downloader class provided by this module
__downloader__ = YoutubeDLDownloader