summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/downloader/http.py
diff options
context:
space:
mode:
author: Unit 193 <unit193@unit193.net> 2025-07-31 01:22:01 -0400
committer: Unit 193 <unit193@unit193.net> 2025-07-31 01:22:01 -0400
commit: a6e995c093de8aae2e91a0787281bb34c0b871eb (patch)
tree: 2d79821b05300d34d8871eb6c9662b359a2de85d /gallery_dl/downloader/http.py
parent: 7672a750cb74bf31e21d76aad2776367fd476155 (diff)
New upstream version 1.30.2. (tag: upstream/1.30.2)
Diffstat (limited to 'gallery_dl/downloader/http.py')
-rw-r--r-- gallery_dl/downloader/http.py 114
1 files changed, 78 insertions, 36 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index c58e2fb..4595483 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,8 +12,9 @@ import time
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
-from .. import text, util, output
+from .. import text, util, output, exception
from ssl import SSLError
+FLAGS = util.FLAGS
class HttpDownloader(DownloaderBase):
@@ -29,6 +30,7 @@ class HttpDownloader(DownloaderBase):
self.metadata = extractor.config("http-metadata")
self.progress = self.config("progress", 3.0)
self.validate = self.config("validate", True)
+ self.validate_html = self.config("validate-html", True)
self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
self.maxsize = self.config("filesize-max")
@@ -68,14 +70,16 @@ class HttpDownloader(DownloaderBase):
chunk_size = 32768
self.chunk_size = chunk_size
if self.rate:
- rate = text.parse_bytes(self.rate)
- if rate:
- if rate < self.chunk_size:
- self.chunk_size = rate
- self.rate = rate
+ func = util.build_selection_func(self.rate, 0, text.parse_bytes)
+ if rmax := func.args[1] if hasattr(func, "args") else func():
+ if rmax < self.chunk_size:
+ # reduce chunk_size to allow for one iteration each second
+ self.chunk_size = rmax
+ self.rate = func
self.receive = self._receive_rate
else:
self.log.warning("Invalid rate limit (%r)", self.rate)
+ self.rate = False
if self.progress is not None:
self.receive = self._receive_rate
if self.progress < 0.0:
@@ -88,8 +92,10 @@ class HttpDownloader(DownloaderBase):
def download(self, url, pathfmt):
try:
return self._download_impl(url, pathfmt)
- except Exception:
- output.stderr_write("\n")
+ except Exception as exc:
+ if self.downloading:
+ output.stderr_write("\n")
+ self.log.debug("", exc_info=exc)
raise
finally:
# remove file from incomplete downloads
@@ -134,16 +140,14 @@ class HttpDownloader(DownloaderBase):
# collect HTTP headers
headers = {"Accept": "*/*"}
# file-specific headers
- extra = kwdict.get("_http_headers")
- if extra:
+ if extra := kwdict.get("_http_headers"):
headers.update(extra)
# general headers
if self.headers:
headers.update(self.headers)
# partial content
- file_size = pathfmt.part_size()
- if file_size:
- headers["Range"] = "bytes={}-".format(file_size)
+ if file_size := pathfmt.part_size():
+ headers["Range"] = f"bytes={file_size}-"
# connect to (remote) source
try:
@@ -161,7 +165,7 @@ class HttpDownloader(DownloaderBase):
reason = exc.args[0].reason
cls = reason.__class__.__name__
pre, _, err = str(reason.args[-1]).partition(":")
- msg = "{}: {}".format(cls, (err or pre).lstrip())
+ msg = f"{cls}: {(err or pre).lstrip()}"
except Exception:
msg = str(exc)
continue
@@ -183,7 +187,7 @@ class HttpDownloader(DownloaderBase):
elif code == 416 and file_size: # Requested Range Not Satisfiable
break
else:
- msg = "'{} {}' for '{}'".format(code, response.reason, url)
+ msg = f"'{code} {response.reason}' for '{url}'"
challenge = util.detect_challenge(response)
if challenge is not None:
@@ -199,8 +203,8 @@ class HttpDownloader(DownloaderBase):
return False
# check for invalid responses
- validate = kwdict.get("_http_validate")
- if validate and self.validate:
+ if self.validate and \
+ (validate := kwdict.get("_http_validate")) is not None:
try:
result = validate(response)
except Exception:
@@ -214,6 +218,14 @@ class HttpDownloader(DownloaderBase):
self.release_conn(response)
self.log.warning("Invalid response")
return False
+ if self.validate_html and response.headers.get(
+ "content-type", "").startswith("text/html") and \
+ pathfmt.extension not in ("html", "htm"):
+ if response.history:
+ self.log.warning("HTTP redirect to '%s'", response.url)
+ else:
+ self.log.warning("HTML response")
+ return False
# check file size
size = text.parse_int(size, None)
@@ -265,19 +277,28 @@ class HttpDownloader(DownloaderBase):
content = response.iter_content(self.chunk_size)
+ validate_sig = kwdict.get("_http_signature")
+ validate_ext = (adjust_extension and
+ pathfmt.extension in SIGNATURE_CHECKS)
+
# check filename extension against file header
- if adjust_extension and not offset and \
- pathfmt.extension in SIGNATURE_CHECKS:
+ if not offset and (validate_ext or validate_sig):
try:
file_header = next(
content if response.raw.chunked
else response.iter_content(16), b"")
except (RequestException, SSLError) as exc:
msg = str(exc)
- output.stderr_write("\n")
continue
- if self._adjust_extension(pathfmt, file_header) and \
- pathfmt.exists():
+ if validate_sig:
+ result = validate_sig(file_header)
+ if result is not True:
+ self.release_conn(response)
+ self.log.warning(
+ result or "Invalid file signature bytes")
+ return False
+ if validate_ext and self._adjust_extension(
+ pathfmt, file_header) and pathfmt.exists():
pathfmt.temppath = ""
response.close()
return True
@@ -294,6 +315,9 @@ class HttpDownloader(DownloaderBase):
# download content
self.downloading = True
with pathfmt.open(mode) as fp:
+ if fp is None:
+ # '.part' file no longer exists
+ break
if file_header:
fp.write(file_header)
offset += len(file_header)
@@ -310,11 +334,16 @@ class HttpDownloader(DownloaderBase):
msg = str(exc)
output.stderr_write("\n")
continue
+ except exception.StopExtraction:
+ response.close()
+ return False
+ except exception.ControlException:
+ response.close()
+ raise
# check file size
if size and fp.tell() < size:
- msg = "file size mismatch ({} < {})".format(
- fp.tell(), size)
+ msg = f"file size mismatch ({fp.tell()} < {size})"
output.stderr_write("\n")
continue
@@ -323,11 +352,11 @@ class HttpDownloader(DownloaderBase):
self.downloading = False
if self.mtime:
if "_http_lastmodified" in kwdict:
- kwdict["_mtime"] = kwdict["_http_lastmodified"]
+ kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
else:
- kwdict["_mtime"] = response.headers.get("Last-Modified")
+ kwdict["_mtime_http"] = response.headers.get("Last-Modified")
else:
- kwdict["_mtime"] = None
+ kwdict["_mtime_http"] = None
return True
@@ -343,14 +372,16 @@ class HttpDownloader(DownloaderBase):
"closing the connection anyway", exc.__class__.__name__, exc)
response.close()
- @staticmethod
- def receive(fp, content, bytes_total, bytes_start):
+ def receive(self, fp, content, bytes_total, bytes_start):
write = fp.write
for data in content:
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
def _receive_rate(self, fp, content, bytes_total, bytes_start):
- rate = self.rate
+ rate = self.rate() if self.rate else None
write = fp.write
progress = self.progress
@@ -363,6 +394,9 @@ class HttpDownloader(DownloaderBase):
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
if progress is not None:
if time_elapsed > progress:
self.out.progress(
@@ -371,7 +405,7 @@ class HttpDownloader(DownloaderBase):
int(bytes_downloaded / time_elapsed),
)
- if rate:
+ if rate is not None:
time_expected = bytes_downloaded / rate
if time_expected > time_elapsed:
time.sleep(time_expected - time_elapsed)
@@ -387,15 +421,13 @@ class HttpDownloader(DownloaderBase):
if mtype in MIME_TYPES:
return MIME_TYPES[mtype]
- ext = mimetypes.guess_extension(mtype, strict=False)
- if ext:
+ if ext := mimetypes.guess_extension(mtype, strict=False):
return ext[1:]
self.log.warning("Unknown MIME type '%s'", mtype)
return "bin"
- @staticmethod
- def _adjust_extension(pathfmt, file_header):
+ def _adjust_extension(self, pathfmt, file_header):
"""Check filename extension against file header"""
if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
for ext, check in SIGNATURE_CHECKS.items():
@@ -452,12 +484,20 @@ MIME_TYPES = {
"application/x-pdf": "pdf",
"application/x-shockwave-flash": "swf",
+ "text/html": "html",
+
"application/ogg": "ogg",
# https://www.iana.org/assignments/media-types/model/obj
"model/obj": "obj",
"application/octet-stream": "bin",
}
+
+def _signature_html(s):
+ s = s[:14].lstrip()
+ return s and b"<!doctype html".startswith(s.lower())
+
+
# https://en.wikipedia.org/wiki/List_of_file_signatures
SIGNATURE_CHECKS = {
"jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
@@ -488,6 +528,8 @@ SIGNATURE_CHECKS = {
"7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
"pdf" : lambda s: s[0:5] == b"%PDF-",
"swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
+ "html": _signature_html,
+ "htm" : _signature_html,
"blend": lambda s: s[0:7] == b"BLENDER",
# unfortunately the Wavefront .obj format doesn't have a signature,
# so we check for the existence of Blender's comment