summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/downloader/http.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/downloader/http.py')
-rw-r--r-- gallery_dl/downloader/http.py 197
1 files changed, 156 insertions, 41 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 961c1a2..7a95191 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2018 Mike Fährmann
+# Copyright 2014-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,11 +8,17 @@
"""Downloader module for http:// and https:// URLs"""
+import os
import time
import mimetypes
-from requests.exceptions import ConnectionError, Timeout
+from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
-from .. import text, exception
+from .. import text
+
+try:
+ from OpenSSL.SSL import Error as SSLError
+except ImportError:
+ from ssl import SSLError
class HttpDownloader(DownloaderBase):
@@ -20,13 +26,16 @@ class HttpDownloader(DownloaderBase):
def __init__(self, extractor, output):
DownloaderBase.__init__(self, extractor, output)
- self.response = None
self.retries = self.config("retries", extractor._retries)
self.timeout = self.config("timeout", extractor._timeout)
self.verify = self.config("verify", extractor._verify)
+ self.mtime = self.config("mtime", True)
self.rate = self.config("rate")
+ self.downloading = False
self.chunk_size = 16384
+ if self.retries < 0:
+ self.retries = float("inf")
if self.rate:
self.rate = text.parse_bytes(self.rate)
if not self.rate:
@@ -34,41 +43,132 @@ class HttpDownloader(DownloaderBase):
elif self.rate < self.chunk_size:
self.chunk_size = self.rate
- def connect(self, url, offset):
- headers = {}
- if offset:
- headers["Range"] = "bytes={}-".format(offset)
-
+ def download(self, url, pathfmt):
try:
- self.response = self.session.request(
- "GET", url, stream=True, headers=headers, allow_redirects=True,
- timeout=self.timeout, verify=self.verify)
- except (ConnectionError, Timeout) as exc:
- raise exception.DownloadRetry(exc)
-
- code = self.response.status_code
- if code == 200: # OK
- offset = 0
- size = self.response.headers.get("Content-Length")
- elif code == 206: # Partial Content
- size = self.response.headers["Content-Range"].rpartition("/")[2]
- elif code == 416: # Requested Range Not Satisfiable
- raise exception.DownloadComplete()
- elif code == 429 or 500 <= code < 600: # Server Error
- raise exception.DownloadRetry(
- "{} Server Error: {} for url: {}".format(
- code, self.response.reason, url))
- else:
- self.response.raise_for_status()
-
- return offset, text.parse_int(size)
-
- def receive(self, file):
+ return self._download_impl(url, pathfmt)
+ except Exception:
+ print()
+ raise
+ finally:
+ # remove file from incomplete downloads
+ if self.downloading and not self.part:
+ try:
+ os.unlink(pathfmt.temppath)
+ except (OSError, AttributeError):
+ pass
+
+ def _download_impl(self, url, pathfmt):
+ response = None
+ adj_ext = None
+ tries = 0
+ msg = ""
+
+ if self.part:
+ pathfmt.part_enable(self.partdir)
+
+ while True:
+ if tries:
+ if response:
+ response.close()
+ self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
+ if tries > self.retries:
+ return False
+ time.sleep(min(2 ** (tries-1), 1800))
+ tries += 1
+
+ # check for .part file
+ filesize = pathfmt.part_size()
+ if filesize:
+ headers = {"Range": "bytes={}-".format(filesize)}
+ else:
+ headers = None
+
+ # connect to (remote) source
+ try:
+ response = self.session.request(
+ "GET", url, stream=True, headers=headers,
+ timeout=self.timeout, verify=self.verify)
+ except (ConnectionError, Timeout) as exc:
+ msg = str(exc)
+ continue
+ except Exception as exc:
+ self.log.warning("%s", exc)
+ return False
+
+ # check response
+ code = response.status_code
+ if code == 200: # OK
+ offset = 0
+ size = response.headers.get("Content-Length")
+ elif code == 206: # Partial Content
+ offset = filesize
+ size = response.headers["Content-Range"].rpartition("/")[2]
+ elif code == 416: # Requested Range Not Satisfiable
+ break
+ else:
+ msg = "{}: {} for url: {}".format(code, response.reason, url)
+ if code == 429 or 500 <= code < 600: # Server Error
+ continue
+ self.log.warning("%s", msg)
+ return False
+ size = text.parse_int(size)
+
+ # set missing filename extension
+ if not pathfmt.has_extension:
+ pathfmt.set_extension(self.get_extension(response))
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+
+ # set open mode
+ if not offset:
+ mode = "w+b"
+ if filesize:
+ self.log.info("Unable to resume partial download")
+ else:
+ mode = "r+b"
+ self.log.info("Resuming download at byte %d", offset)
+
+ # start downloading
+ self.out.start(pathfmt.path)
+ self.downloading = True
+ with pathfmt.open(mode) as file:
+ if offset:
+ file.seek(offset)
+
+ # download content
+ try:
+ self.receive(response, file)
+ except (RequestException, SSLError) as exc:
+ msg = str(exc)
+ print()
+ continue
+
+ # check filesize
+ if size and file.tell() < size:
+ msg = "filesize mismatch ({} < {})".format(
+ file.tell(), size)
+ print()
+ continue
+
+ # check filename extension
+ adj_ext = self.check_extension(file, pathfmt)
+
+ break
+
+ self.downloading = False
+ if adj_ext:
+ pathfmt.set_extension(adj_ext)
+ if self.mtime:
+ pathfmt.keywords["_mtime"] = response.headers.get("Last-Modified")
+ return True
+
+ def receive(self, response, file):
if self.rate:
total = 0 # total amount of bytes received
start = time.time() # start time
- for data in self.response.iter_content(self.chunk_size):
+ for data in response.iter_content(self.chunk_size):
file.write(data)
if self.rate:
@@ -79,13 +179,8 @@ class HttpDownloader(DownloaderBase):
# sleep if less time passed than expected
time.sleep(expected - delta)
- def reset(self):
- if self.response:
- self.response.close()
- self.response = None
-
- def get_extension(self):
- mtype = self.response.headers.get("Content-Type", "image/jpeg")
+ def get_extension(self, response):
+ mtype = response.headers.get("Content-Type", "image/jpeg")
mtype = mtype.partition(";")[0]
if mtype in MIMETYPE_MAP:
@@ -100,6 +195,26 @@ class HttpDownloader(DownloaderBase):
"No filename extension found for MIME type '%s'", mtype)
return "txt"
+ @staticmethod
+ def check_extension(file, pathfmt):
+ """Check filename extension against fileheader"""
+ extension = pathfmt.keywords["extension"]
+ if extension in FILETYPE_CHECK:
+ file.seek(0)
+ header = file.read(8)
+ if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
+ for ext, check in FILETYPE_CHECK.items():
+ if ext != extension and check(header):
+ return ext
+ return None
+
+
+FILETYPE_CHECK = {
+ "jpg": lambda h: h[0:2] == b"\xff\xd8",
+ "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
+ "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
+}
+
MIMETYPE_MAP = {
"image/jpeg": "jpg",