summary refs log tree commit diff stats
path: root/gallery_dl/downloader
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/downloader')
-rw-r--r-- gallery_dl/downloader/common.py 12
-rw-r--r-- gallery_dl/downloader/http.py 114
-rw-r--r-- gallery_dl/downloader/ytdl.py 52
3 files changed, 115 insertions, 63 deletions
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
index dc1219f..7cd8d10 100644
--- a/gallery_dl/downloader/common.py
+++ b/gallery_dl/downloader/common.py
@@ -21,8 +21,7 @@ class DownloaderBase():
extractor = job.extractor
self.log = job.get_logger("downloader." + self.scheme)
- opts = self._extractor_config(extractor)
- if opts:
+ if opts := self._extractor_config(extractor):
self.opts = opts
self.config = self.config_opts
@@ -60,8 +59,7 @@ class DownloaderBase():
opts = {}
for cat, sub in reversed(path):
- popts = self._extractor_opts(cat, sub)
- if popts:
+ if popts := self._extractor_opts(cat, sub):
opts.update(popts)
return opts
@@ -70,12 +68,10 @@ class DownloaderBase():
if not cfg:
return None
- copts = cfg.get(self.scheme)
- if copts:
+ if copts := cfg.get(self.scheme):
if subcategory in cfg:
try:
- sopts = cfg[subcategory].get(self.scheme)
- if sopts:
+ if sopts := cfg[subcategory].get(self.scheme):
opts = copts.copy()
opts.update(sopts)
return opts
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index c58e2fb..4595483 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,8 +12,9 @@ import time
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
-from .. import text, util, output
+from .. import text, util, output, exception
from ssl import SSLError
+FLAGS = util.FLAGS
class HttpDownloader(DownloaderBase):
@@ -29,6 +30,7 @@ class HttpDownloader(DownloaderBase):
self.metadata = extractor.config("http-metadata")
self.progress = self.config("progress", 3.0)
self.validate = self.config("validate", True)
+ self.validate_html = self.config("validate-html", True)
self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
self.maxsize = self.config("filesize-max")
@@ -68,14 +70,16 @@ class HttpDownloader(DownloaderBase):
chunk_size = 32768
self.chunk_size = chunk_size
if self.rate:
- rate = text.parse_bytes(self.rate)
- if rate:
- if rate < self.chunk_size:
- self.chunk_size = rate
- self.rate = rate
+ func = util.build_selection_func(self.rate, 0, text.parse_bytes)
+ if rmax := func.args[1] if hasattr(func, "args") else func():
+ if rmax < self.chunk_size:
+ # reduce chunk_size to allow for one iteration each second
+ self.chunk_size = rmax
+ self.rate = func
self.receive = self._receive_rate
else:
self.log.warning("Invalid rate limit (%r)", self.rate)
+ self.rate = False
if self.progress is not None:
self.receive = self._receive_rate
if self.progress < 0.0:
@@ -88,8 +92,10 @@ class HttpDownloader(DownloaderBase):
def download(self, url, pathfmt):
try:
return self._download_impl(url, pathfmt)
- except Exception:
- output.stderr_write("\n")
+ except Exception as exc:
+ if self.downloading:
+ output.stderr_write("\n")
+ self.log.debug("", exc_info=exc)
raise
finally:
# remove file from incomplete downloads
@@ -134,16 +140,14 @@ class HttpDownloader(DownloaderBase):
# collect HTTP headers
headers = {"Accept": "*/*"}
# file-specific headers
- extra = kwdict.get("_http_headers")
- if extra:
+ if extra := kwdict.get("_http_headers"):
headers.update(extra)
# general headers
if self.headers:
headers.update(self.headers)
# partial content
- file_size = pathfmt.part_size()
- if file_size:
- headers["Range"] = "bytes={}-".format(file_size)
+ if file_size := pathfmt.part_size():
+ headers["Range"] = f"bytes={file_size}-"
# connect to (remote) source
try:
@@ -161,7 +165,7 @@ class HttpDownloader(DownloaderBase):
reason = exc.args[0].reason
cls = reason.__class__.__name__
pre, _, err = str(reason.args[-1]).partition(":")
- msg = "{}: {}".format(cls, (err or pre).lstrip())
+ msg = f"{cls}: {(err or pre).lstrip()}"
except Exception:
msg = str(exc)
continue
@@ -183,7 +187,7 @@ class HttpDownloader(DownloaderBase):
elif code == 416 and file_size: # Requested Range Not Satisfiable
break
else:
- msg = "'{} {}' for '{}'".format(code, response.reason, url)
+ msg = f"'{code} {response.reason}' for '{url}'"
challenge = util.detect_challenge(response)
if challenge is not None:
@@ -199,8 +203,8 @@ class HttpDownloader(DownloaderBase):
return False
# check for invalid responses
- validate = kwdict.get("_http_validate")
- if validate and self.validate:
+ if self.validate and \
+ (validate := kwdict.get("_http_validate")) is not None:
try:
result = validate(response)
except Exception:
@@ -214,6 +218,14 @@ class HttpDownloader(DownloaderBase):
self.release_conn(response)
self.log.warning("Invalid response")
return False
+ if self.validate_html and response.headers.get(
+ "content-type", "").startswith("text/html") and \
+ pathfmt.extension not in ("html", "htm"):
+ if response.history:
+ self.log.warning("HTTP redirect to '%s'", response.url)
+ else:
+ self.log.warning("HTML response")
+ return False
# check file size
size = text.parse_int(size, None)
@@ -265,19 +277,28 @@ class HttpDownloader(DownloaderBase):
content = response.iter_content(self.chunk_size)
+ validate_sig = kwdict.get("_http_signature")
+ validate_ext = (adjust_extension and
+ pathfmt.extension in SIGNATURE_CHECKS)
+
# check filename extension against file header
- if adjust_extension and not offset and \
- pathfmt.extension in SIGNATURE_CHECKS:
+ if not offset and (validate_ext or validate_sig):
try:
file_header = next(
content if response.raw.chunked
else response.iter_content(16), b"")
except (RequestException, SSLError) as exc:
msg = str(exc)
- output.stderr_write("\n")
continue
- if self._adjust_extension(pathfmt, file_header) and \
- pathfmt.exists():
+ if validate_sig:
+ result = validate_sig(file_header)
+ if result is not True:
+ self.release_conn(response)
+ self.log.warning(
+ result or "Invalid file signature bytes")
+ return False
+ if validate_ext and self._adjust_extension(
+ pathfmt, file_header) and pathfmt.exists():
pathfmt.temppath = ""
response.close()
return True
@@ -294,6 +315,9 @@ class HttpDownloader(DownloaderBase):
# download content
self.downloading = True
with pathfmt.open(mode) as fp:
+ if fp is None:
+ # '.part' file no longer exists
+ break
if file_header:
fp.write(file_header)
offset += len(file_header)
@@ -310,11 +334,16 @@ class HttpDownloader(DownloaderBase):
msg = str(exc)
output.stderr_write("\n")
continue
+ except exception.StopExtraction:
+ response.close()
+ return False
+ except exception.ControlException:
+ response.close()
+ raise
# check file size
if size and fp.tell() < size:
- msg = "file size mismatch ({} < {})".format(
- fp.tell(), size)
+ msg = f"file size mismatch ({fp.tell()} < {size})"
output.stderr_write("\n")
continue
@@ -323,11 +352,11 @@ class HttpDownloader(DownloaderBase):
self.downloading = False
if self.mtime:
if "_http_lastmodified" in kwdict:
- kwdict["_mtime"] = kwdict["_http_lastmodified"]
+ kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
else:
- kwdict["_mtime"] = response.headers.get("Last-Modified")
+ kwdict["_mtime_http"] = response.headers.get("Last-Modified")
else:
- kwdict["_mtime"] = None
+ kwdict["_mtime_http"] = None
return True
@@ -343,14 +372,16 @@ class HttpDownloader(DownloaderBase):
"closing the connection anyway", exc.__class__.__name__, exc)
response.close()
- @staticmethod
- def receive(fp, content, bytes_total, bytes_start):
+ def receive(self, fp, content, bytes_total, bytes_start):
write = fp.write
for data in content:
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
def _receive_rate(self, fp, content, bytes_total, bytes_start):
- rate = self.rate
+ rate = self.rate() if self.rate else None
write = fp.write
progress = self.progress
@@ -363,6 +394,9 @@ class HttpDownloader(DownloaderBase):
write(data)
+ if FLAGS.DOWNLOAD is not None:
+ FLAGS.process("DOWNLOAD")
+
if progress is not None:
if time_elapsed > progress:
self.out.progress(
@@ -371,7 +405,7 @@ class HttpDownloader(DownloaderBase):
int(bytes_downloaded / time_elapsed),
)
- if rate:
+ if rate is not None:
time_expected = bytes_downloaded / rate
if time_expected > time_elapsed:
time.sleep(time_expected - time_elapsed)
@@ -387,15 +421,13 @@ class HttpDownloader(DownloaderBase):
if mtype in MIME_TYPES:
return MIME_TYPES[mtype]
- ext = mimetypes.guess_extension(mtype, strict=False)
- if ext:
+ if ext := mimetypes.guess_extension(mtype, strict=False):
return ext[1:]
self.log.warning("Unknown MIME type '%s'", mtype)
return "bin"
- @staticmethod
- def _adjust_extension(pathfmt, file_header):
+ def _adjust_extension(self, pathfmt, file_header):
"""Check filename extension against file header"""
if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
for ext, check in SIGNATURE_CHECKS.items():
@@ -452,12 +484,20 @@ MIME_TYPES = {
"application/x-pdf": "pdf",
"application/x-shockwave-flash": "swf",
+ "text/html": "html",
+
"application/ogg": "ogg",
# https://www.iana.org/assignments/media-types/model/obj
"model/obj": "obj",
"application/octet-stream": "bin",
}
+
+def _signature_html(s):
+ s = s[:14].lstrip()
+ return s and b"<!doctype html".startswith(s.lower())
+
+
# https://en.wikipedia.org/wiki/List_of_file_signatures
SIGNATURE_CHECKS = {
"jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
@@ -488,6 +528,8 @@ SIGNATURE_CHECKS = {
"7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
"pdf" : lambda s: s[0:5] == b"%PDF-",
"swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
+ "html": _signature_html,
+ "htm" : _signature_html,
"blend": lambda s: s[0:7] == b"BLENDER",
# unfortunately the Wavefront .obj format doesn't have a signature,
# so we check for the existence of Blender's comment
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 1fc2f82..69a59ff 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -30,6 +30,7 @@ class YoutubeDLDownloader(DownloaderBase):
}
self.ytdl_instance = None
+ self.rate_dyn = None
self.forward_cookies = self.config("forward-cookies", True)
self.progress = self.config("progress", 3.0)
self.outtmpl = self.config("outtmpl")
@@ -67,18 +68,23 @@ class YoutubeDLDownloader(DownloaderBase):
for cookie in self.session.cookies:
set_cookie(cookie)
- if self.progress is not None and not ytdl_instance._progress_hooks:
- ytdl_instance.add_progress_hook(self._progress_hook)
+ if "__gdl_initialize" in ytdl_instance.params:
+ del ytdl_instance.params["__gdl_initialize"]
+
+ if self.progress is not None:
+ ytdl_instance.add_progress_hook(self._progress_hook)
+ if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False):
+ self.rate_dyn = rlf
info_dict = kwdict.pop("_ytdl_info_dict", None)
if not info_dict:
url = url[5:]
try:
- manifest = kwdict.pop("_ytdl_manifest", None)
- if manifest:
+ if manifest := kwdict.pop("_ytdl_manifest", None):
info_dict = self._extract_manifest(
ytdl_instance, url, manifest,
- kwdict.pop("_ytdl_manifest_data", None))
+ kwdict.pop("_ytdl_manifest_data", None),
+ kwdict.pop("_ytdl_manifest_headers", None))
else:
info_dict = self._extract_info(ytdl_instance, url)
except Exception as exc:
@@ -96,8 +102,7 @@ class YoutubeDLDownloader(DownloaderBase):
else:
info_dict = info_dict["entries"][index]
- extra = kwdict.get("_ytdl_extra")
- if extra:
+ if extra := kwdict.get("_ytdl_extra"):
info_dict.update(extra)
return self._download_video(ytdl_instance, pathfmt, info_dict)
@@ -131,26 +136,31 @@ class YoutubeDLDownloader(DownloaderBase):
pathfmt.temppath = ""
return True
+ if self.rate_dyn is not None:
+ # static ratelimits are set in ytdl.construct_YoutubeDL
+ ytdl_instance.params["ratelimit"] = self.rate_dyn()
+
self.out.start(pathfmt.path)
if self.part:
- pathfmt.kwdict["extension"] = pathfmt.prefix + "part"
+ pathfmt.kwdict["extension"] = pathfmt.prefix
filename = pathfmt.build_filename(pathfmt.kwdict)
pathfmt.kwdict["extension"] = info_dict["ext"]
if self.partdir:
path = os.path.join(self.partdir, filename)
else:
path = pathfmt.realdirectory + filename
+ path = path.replace("%", "%%") + "%(ext)s"
else:
- path = pathfmt.realpath
+ path = pathfmt.realpath.replace("%", "%%")
- self._set_outtmpl(ytdl_instance, path.replace("%", "%%"))
+ self._set_outtmpl(ytdl_instance, path)
try:
ytdl_instance.process_info(info_dict)
except Exception as exc:
self.log.debug("", exc_info=exc)
return False
- pathfmt.temppath = info_dict["filepath"]
+ pathfmt.temppath = info_dict.get("filepath") or info_dict["_filename"]
return True
def _download_playlist(self, ytdl_instance, pathfmt, info_dict):
@@ -159,13 +169,16 @@ class YoutubeDLDownloader(DownloaderBase):
self._set_outtmpl(ytdl_instance, pathfmt.realpath)
for entry in info_dict["entries"]:
+ if self.rate_dyn is not None:
+ ytdl_instance.params["ratelimit"] = self.rate_dyn()
ytdl_instance.process_info(entry)
return True
def _extract_info(self, ytdl, url):
return ytdl.extract_info(url, download=False)
- def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None):
+ def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None,
+ headers=None):
extr = ytdl.get_info_extractor("Generic")
video_id = extr._generic_id(url)
@@ -173,9 +186,10 @@ class YoutubeDLDownloader(DownloaderBase):
if manifest_data is None:
try:
fmts, subs = extr._extract_m3u8_formats_and_subtitles(
- url, video_id, "mp4")
+ url, video_id, "mp4", headers=headers)
except AttributeError:
- fmts = extr._extract_m3u8_formats(url, video_id, "mp4")
+ fmts = extr._extract_m3u8_formats(
+ url, video_id, "mp4", headers=headers)
subs = None
else:
try:
@@ -189,9 +203,10 @@ class YoutubeDLDownloader(DownloaderBase):
if manifest_data is None:
try:
fmts, subs = extr._extract_mpd_formats_and_subtitles(
- url, video_id)
+ url, video_id, headers=headers)
except AttributeError:
- fmts = extr._extract_mpd_formats(url, video_id)
+ fmts = extr._extract_mpd_formats(
+ url, video_id, headers=headers)
subs = None
else:
if isinstance(manifest_data, str):
@@ -228,8 +243,7 @@ class YoutubeDLDownloader(DownloaderBase):
int(speed) if speed else 0,
)
- @staticmethod
- def _set_outtmpl(ytdl_instance, outtmpl):
+ def _set_outtmpl(self, ytdl_instance, outtmpl):
try:
ytdl_instance._parse_outtmpl
except AttributeError: