diff options
Diffstat (limited to 'gallery_dl/downloader')
| -rw-r--r-- | gallery_dl/downloader/common.py | 12 | ||||
| -rw-r--r-- | gallery_dl/downloader/http.py | 114 | ||||
| -rw-r--r-- | gallery_dl/downloader/ytdl.py | 52 |
3 files changed, 115 insertions, 63 deletions
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index dc1219f..7cd8d10 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -21,8 +21,7 @@ class DownloaderBase(): extractor = job.extractor self.log = job.get_logger("downloader." + self.scheme) - opts = self._extractor_config(extractor) - if opts: + if opts := self._extractor_config(extractor): self.opts = opts self.config = self.config_opts @@ -60,8 +59,7 @@ class DownloaderBase(): opts = {} for cat, sub in reversed(path): - popts = self._extractor_opts(cat, sub) - if popts: + if popts := self._extractor_opts(cat, sub): opts.update(popts) return opts @@ -70,12 +68,10 @@ class DownloaderBase(): if not cfg: return None - copts = cfg.get(self.scheme) - if copts: + if copts := cfg.get(self.scheme): if subcategory in cfg: try: - sopts = cfg[subcategory].get(self.scheme) - if sopts: + if sopts := cfg[subcategory].get(self.scheme): opts = copts.copy() opts.update(sopts) return opts diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index c58e2fb..4595483 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,8 +12,9 @@ import time import mimetypes from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, util, output +from .. import text, util, output, exception from ssl import SSLError +FLAGS = util.FLAGS class HttpDownloader(DownloaderBase): @@ -29,6 +30,7 @@ class HttpDownloader(DownloaderBase): self.metadata = extractor.config("http-metadata") self.progress = self.config("progress", 3.0) self.validate = self.config("validate", True) + self.validate_html = self.config("validate-html", True) self.headers = self.config("headers") self.minsize = self.config("filesize-min") self.maxsize = self.config("filesize-max") @@ -68,14 +70,16 @@ class HttpDownloader(DownloaderBase): chunk_size = 32768 self.chunk_size = chunk_size if self.rate: - rate = text.parse_bytes(self.rate) - if rate: - if rate < self.chunk_size: - self.chunk_size = rate - self.rate = rate + func = util.build_selection_func(self.rate, 0, text.parse_bytes) + if rmax := func.args[1] if hasattr(func, "args") else func(): + if rmax < self.chunk_size: + # reduce chunk_size to allow for one iteration each second + self.chunk_size = rmax + self.rate = func self.receive = self._receive_rate else: self.log.warning("Invalid rate limit (%r)", self.rate) + self.rate = False if self.progress is not None: self.receive = self._receive_rate if self.progress < 0.0: @@ -88,8 +92,10 @@ class HttpDownloader(DownloaderBase): def download(self, url, pathfmt): try: return self._download_impl(url, pathfmt) - except Exception: - output.stderr_write("\n") + except Exception as exc: + if self.downloading: + output.stderr_write("\n") + self.log.debug("", exc_info=exc) raise finally: # remove file from incomplete downloads @@ -134,16 +140,14 @@ class HttpDownloader(DownloaderBase): # collect HTTP headers headers = {"Accept": "*/*"} # file-specific headers - extra = kwdict.get("_http_headers") - if extra: + if extra := kwdict.get("_http_headers"): headers.update(extra) # general headers if self.headers: headers.update(self.headers) # partial content - file_size = pathfmt.part_size() - if file_size: - headers["Range"] = "bytes={}-".format(file_size) + if file_size := pathfmt.part_size(): + headers["Range"] = f"bytes={file_size}-" # connect to (remote) source try: @@ -161,7 +165,7 @@ class HttpDownloader(DownloaderBase): reason = exc.args[0].reason cls = reason.__class__.__name__ pre, _, err = str(reason.args[-1]).partition(":") - msg = "{}: {}".format(cls, (err or pre).lstrip()) + msg = f"{cls}: {(err or pre).lstrip()}" except Exception: msg = str(exc) continue @@ -183,7 +187,7 @@ class HttpDownloader(DownloaderBase): elif code == 416 and file_size: # Requested Range Not Satisfiable break else: - msg = "'{} {}' for '{}'".format(code, response.reason, url) + msg = f"'{code} {response.reason}' for '{url}'" challenge = util.detect_challenge(response) if challenge is not None: @@ -199,8 +203,8 @@ class HttpDownloader(DownloaderBase): return False # check for invalid responses - validate = kwdict.get("_http_validate") - if validate and self.validate: + if self.validate and \ + (validate := kwdict.get("_http_validate")) is not None: try: result = validate(response) except Exception: @@ -214,6 +218,14 @@ class HttpDownloader(DownloaderBase): self.release_conn(response) self.log.warning("Invalid response") return False + if self.validate_html and response.headers.get( + "content-type", "").startswith("text/html") and \ + pathfmt.extension not in ("html", "htm"): + if response.history: + self.log.warning("HTTP redirect to '%s'", response.url) + else: + self.log.warning("HTML response") + return False # check file size size = text.parse_int(size, None) @@ -265,19 +277,28 @@ class HttpDownloader(DownloaderBase): content = response.iter_content(self.chunk_size) + validate_sig = kwdict.get("_http_signature") + validate_ext = (adjust_extension and + pathfmt.extension in SIGNATURE_CHECKS) + # check filename extension against file header - if adjust_extension and not offset and \ - pathfmt.extension in SIGNATURE_CHECKS: + if not offset and (validate_ext or validate_sig): try: file_header = next( content if response.raw.chunked else response.iter_content(16), b"") except (RequestException, SSLError) as exc: msg = str(exc) - output.stderr_write("\n") continue - if self._adjust_extension(pathfmt, file_header) and \ - pathfmt.exists(): + if validate_sig: + result = validate_sig(file_header) + if result is not True: + self.release_conn(response) + self.log.warning( + result or "Invalid file signature bytes") + return False + if validate_ext and self._adjust_extension( + pathfmt, file_header) and pathfmt.exists(): pathfmt.temppath = "" response.close() return True @@ -294,6 +315,9 @@ class HttpDownloader(DownloaderBase): # download content self.downloading = True with pathfmt.open(mode) as fp: + if fp is None: + # '.part' file no longer exists + break if file_header: fp.write(file_header) offset += len(file_header) @@ -310,11 +334,16 @@ class HttpDownloader(DownloaderBase): msg = str(exc) output.stderr_write("\n") continue + except exception.StopExtraction: + response.close() + return False + except exception.ControlException: + response.close() + raise # check file size if size and fp.tell() < size: - msg = "file size mismatch ({} < {})".format( - fp.tell(), size) + msg = f"file size mismatch ({fp.tell()} < {size})" output.stderr_write("\n") continue @@ -323,11 +352,11 @@ class HttpDownloader(DownloaderBase): self.downloading = False if self.mtime: if "_http_lastmodified" in kwdict: - kwdict["_mtime"] = kwdict["_http_lastmodified"] + kwdict["_mtime_http"] = kwdict["_http_lastmodified"] else: - kwdict["_mtime"] = response.headers.get("Last-Modified") + kwdict["_mtime_http"] = response.headers.get("Last-Modified") else: - kwdict["_mtime"] = None + kwdict["_mtime_http"] = None return True @@ -343,14 +372,16 @@ class HttpDownloader(DownloaderBase): "closing the connection anyway", exc.__class__.__name__, exc) response.close() - @staticmethod - def receive(fp, content, bytes_total, bytes_start): + def receive(self, fp, content, bytes_total, bytes_start): write = fp.write for data in content: write(data) + if FLAGS.DOWNLOAD is not None: + FLAGS.process("DOWNLOAD") + def _receive_rate(self, fp, content, bytes_total, bytes_start): - rate = self.rate + rate = self.rate() if self.rate else None write = fp.write progress = self.progress @@ -363,6 +394,9 @@ class HttpDownloader(DownloaderBase): write(data) + if FLAGS.DOWNLOAD is not None: + FLAGS.process("DOWNLOAD") + if progress is not None: if time_elapsed > progress: self.out.progress( @@ -371,7 +405,7 @@ class HttpDownloader(DownloaderBase): int(bytes_downloaded / time_elapsed), ) - if rate: + if rate is not None: time_expected = bytes_downloaded / rate if time_expected > time_elapsed: time.sleep(time_expected - time_elapsed) @@ -387,15 +421,13 @@ class HttpDownloader(DownloaderBase): if mtype in MIME_TYPES: return MIME_TYPES[mtype] - ext = mimetypes.guess_extension(mtype, strict=False) - if ext: + if ext := mimetypes.guess_extension(mtype, strict=False): return ext[1:] self.log.warning("Unknown MIME type '%s'", mtype) return "bin" - @staticmethod - def _adjust_extension(pathfmt, file_header): + def _adjust_extension(self, pathfmt, file_header): """Check filename extension against file header""" if not SIGNATURE_CHECKS[pathfmt.extension](file_header): for ext, check in SIGNATURE_CHECKS.items(): @@ -452,12 +484,20 @@ MIME_TYPES = { "application/x-pdf": "pdf", "application/x-shockwave-flash": "swf", + "text/html": "html", + "application/ogg": "ogg", # https://www.iana.org/assignments/media-types/model/obj "model/obj": "obj", "application/octet-stream": "bin", } + +def _signature_html(s): + s = s[:14].lstrip() + return s and b"<!doctype html".startswith(s.lower()) + + # https://en.wikipedia.org/wiki/List_of_file_signatures SIGNATURE_CHECKS = { "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF", @@ -488,6 +528,8 @@ SIGNATURE_CHECKS = { "7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C", "pdf" : lambda s: s[0:5] == b"%PDF-", "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"), + "html": _signature_html, + "htm" : _signature_html, "blend": lambda s: s[0:7] == b"BLENDER", # unfortunately the Wavefront .obj format doesn't have a signature, # so we check for the existence of Blender's comment diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 1fc2f82..69a59ff 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -30,6 +30,7 @@ class YoutubeDLDownloader(DownloaderBase): } self.ytdl_instance = None + self.rate_dyn = None self.forward_cookies = self.config("forward-cookies", True) self.progress = self.config("progress", 3.0) self.outtmpl = self.config("outtmpl") @@ -67,18 +68,23 @@ class YoutubeDLDownloader(DownloaderBase): for cookie in self.session.cookies: set_cookie(cookie) - if self.progress is not None and not ytdl_instance._progress_hooks: - ytdl_instance.add_progress_hook(self._progress_hook) + if "__gdl_initialize" in ytdl_instance.params: + del ytdl_instance.params["__gdl_initialize"] + + if self.progress is not None: + ytdl_instance.add_progress_hook(self._progress_hook) + if rlf := ytdl_instance.params.pop("__gdl_ratelimit_func", False): + self.rate_dyn = rlf info_dict = kwdict.pop("_ytdl_info_dict", None) if not info_dict: url = url[5:] try: - manifest = kwdict.pop("_ytdl_manifest", None) - if manifest: + if manifest := kwdict.pop("_ytdl_manifest", None): info_dict = self._extract_manifest( ytdl_instance, url, manifest, - kwdict.pop("_ytdl_manifest_data", None)) + kwdict.pop("_ytdl_manifest_data", None), + kwdict.pop("_ytdl_manifest_headers", None)) else: info_dict = self._extract_info(ytdl_instance, url) except Exception as exc: @@ -96,8 +102,7 @@ class YoutubeDLDownloader(DownloaderBase): else: info_dict = info_dict["entries"][index] - extra = kwdict.get("_ytdl_extra") - if extra: + if extra := kwdict.get("_ytdl_extra"): info_dict.update(extra) return self._download_video(ytdl_instance, pathfmt, info_dict) @@ -131,26 +136,31 @@ class YoutubeDLDownloader(DownloaderBase): pathfmt.temppath = "" return True + if self.rate_dyn is not None: + # static ratelimits are set in ytdl.construct_YoutubeDL + ytdl_instance.params["ratelimit"] = self.rate_dyn() + self.out.start(pathfmt.path) if self.part: - pathfmt.kwdict["extension"] = pathfmt.prefix + "part" + pathfmt.kwdict["extension"] = pathfmt.prefix filename = pathfmt.build_filename(pathfmt.kwdict) pathfmt.kwdict["extension"] = info_dict["ext"] if self.partdir: path = os.path.join(self.partdir, filename) else: path = pathfmt.realdirectory + filename + path = path.replace("%", "%%") + "%(ext)s" else: - path = pathfmt.realpath + path = pathfmt.realpath.replace("%", "%%") - self._set_outtmpl(ytdl_instance, path.replace("%", "%%")) + self._set_outtmpl(ytdl_instance, path) try: ytdl_instance.process_info(info_dict) except Exception as exc: self.log.debug("", exc_info=exc) return False - pathfmt.temppath = info_dict["filepath"] + pathfmt.temppath = info_dict.get("filepath") or info_dict["_filename"] return True def _download_playlist(self, ytdl_instance, pathfmt, info_dict): @@ -159,13 +169,16 @@ class YoutubeDLDownloader(DownloaderBase): self._set_outtmpl(ytdl_instance, pathfmt.realpath) for entry in info_dict["entries"]: + if self.rate_dyn is not None: + ytdl_instance.params["ratelimit"] = self.rate_dyn() ytdl_instance.process_info(entry) return True def _extract_info(self, ytdl, url): return ytdl.extract_info(url, download=False) - def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None): + def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None, + headers=None): extr = ytdl.get_info_extractor("Generic") video_id = extr._generic_id(url) @@ -173,9 +186,10 @@ class YoutubeDLDownloader(DownloaderBase): if manifest_data is None: try: fmts, subs = extr._extract_m3u8_formats_and_subtitles( - url, video_id, "mp4") + url, video_id, "mp4", headers=headers) except AttributeError: - fmts = extr._extract_m3u8_formats(url, video_id, "mp4") + fmts = extr._extract_m3u8_formats( + url, video_id, "mp4", headers=headers) subs = None else: try: @@ -189,9 +203,10 @@ class YoutubeDLDownloader(DownloaderBase): if manifest_data is None: try: fmts, subs = extr._extract_mpd_formats_and_subtitles( - url, video_id) + url, video_id, headers=headers) except AttributeError: - fmts = extr._extract_mpd_formats(url, video_id) + fmts = extr._extract_mpd_formats( + url, video_id, headers=headers) subs = None else: if isinstance(manifest_data, str): @@ -228,8 +243,7 @@ class YoutubeDLDownloader(DownloaderBase): int(speed) if speed else 0, ) - @staticmethod - def _set_outtmpl(ytdl_instance, outtmpl): + def _set_outtmpl(self, ytdl_instance, outtmpl): try: ytdl_instance._parse_outtmpl except AttributeError: |
