diff options
Diffstat (limited to 'gallery_dl/downloader')
| -rw-r--r-- | gallery_dl/downloader/http.py | 139 | ||||
| -rw-r--r-- | gallery_dl/downloader/ytdl.py | 2 |
2 files changed, 85 insertions, 56 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 5622462..26eb7b5 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -27,10 +27,11 @@ class HttpDownloader(DownloaderBase): def __init__(self, job): DownloaderBase.__init__(self, job) extractor = job.extractor - self.chunk_size = 16384 self.downloading = False self.adjust_extension = self.config("adjust-extensions", True) + self.chunk_size = self.config("chunk-size", 32768) + self.metadata = extractor.config("http-metadata") self.progress = self.config("progress", 3.0) self.headers = self.config("headers") self.minsize = self.config("filesize-min") @@ -55,6 +56,13 @@ class HttpDownloader(DownloaderBase): self.log.warning( "Invalid maximum file size (%r)", self.maxsize) self.maxsize = maxsize + if isinstance(self.chunk_size, str): + chunk_size = text.parse_bytes(self.chunk_size) + if not chunk_size: + self.log.warning( + "Invalid chunk size (%r)", self.chunk_size) + chunk_size = 32768 + self.chunk_size = chunk_size if self.rate: rate = text.parse_bytes(self.rate) if rate: @@ -83,11 +91,12 @@ class HttpDownloader(DownloaderBase): tries = 0 msg = "" + metadata = self.metadata kwdict = pathfmt.kwdict adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) - if self.part: + if self.part and not metadata: pathfmt.part_enable(self.partdir) while True: @@ -164,13 +173,6 @@ class HttpDownloader(DownloaderBase): self.log.warning("Invalid response") return False - # set missing filename extension from MIME type - if not pathfmt.extension: - pathfmt.set_extension(self._find_extension(response)) - if pathfmt.exists(): - pathfmt.temppath = "" - return True - # check file size size = text.parse_int(size, None) if size is not None: @@ -185,11 +187,33 @@ class HttpDownloader(DownloaderBase): size, self.maxsize) return False + build_path = False + + # set missing filename extension from MIME type + if not pathfmt.extension: + pathfmt.set_extension(self._find_extension(response)) + build_path = True + + # set metadata from HTTP headers + if metadata: + kwdict[metadata] = util.extract_headers(response) + build_path = True + + # build and check file path + if build_path: + pathfmt.build_path() + if pathfmt.exists(): + pathfmt.temppath = "" + return True + if self.part and metadata: + pathfmt.part_enable(self.partdir) + metadata = False + content = response.iter_content(self.chunk_size) # check filename extension against file header if adjust_extension and not offset and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: try: file_header = next( content if response.raw.chunked @@ -220,7 +244,7 @@ class HttpDownloader(DownloaderBase): offset += len(file_header) elif offset: if adjust_extension and \ - pathfmt.extension in FILE_SIGNATURES: + pathfmt.extension in SIGNATURE_CHECKS: self._adjust_extension(pathfmt, fp.read(16)) fp.seek(offset) @@ -250,42 +274,38 @@ class HttpDownloader(DownloaderBase): return True @staticmethod - def receive(fp, content, bytes_total, bytes_downloaded): + def receive(fp, content, bytes_total, bytes_start): write = fp.write for data in content: write(data) - def _receive_rate(self, fp, content, bytes_total, bytes_downloaded): + def _receive_rate(self, fp, content, bytes_total, bytes_start): rate = self.rate - progress = self.progress - bytes_start = bytes_downloaded write = fp.write - t1 = tstart = time.time() + progress = self.progress + + bytes_downloaded = 0 + time_start = time.time() for data in content: - write(data) + time_current = time.time() + time_elapsed = time_current - time_start + bytes_downloaded += len(data) - t2 = time.time() # current time - elapsed = t2 - t1 # elapsed time - num_bytes = len(data) + write(data) if progress is not None: - bytes_downloaded += num_bytes - tdiff = t2 - tstart - if tdiff >= progress: + if time_elapsed >= progress: self.out.progress( - bytes_total, bytes_downloaded, - int((bytes_downloaded - bytes_start) / tdiff), + bytes_total, + bytes_start + bytes_downloaded, + int(bytes_downloaded / time_elapsed), ) if rate: - expected = num_bytes / rate # expected elapsed time - if elapsed < expected: - # sleep if less time elapsed than expected - time.sleep(expected - elapsed) - t2 = time.time() - - t1 = t2 + time_expected = bytes_downloaded / rate + if time_expected > time_elapsed: + time.sleep(time_expected - time_elapsed) def _find_extension(self, response): """Get filename extension from MIME type""" @@ -308,11 +328,11 @@ class HttpDownloader(DownloaderBase): @staticmethod def _adjust_extension(pathfmt, file_header): """Check filename extension against file header""" - sig = FILE_SIGNATURES[pathfmt.extension] - if not file_header.startswith(sig): - for ext, sig in FILE_SIGNATURES.items(): - if file_header.startswith(sig): + if not SIGNATURE_CHECKS[pathfmt.extension](file_header): + for ext, check in SIGNATURE_CHECKS.items(): + if check(file_header): pathfmt.set_extension(ext) + pathfmt.build_path() return True return False @@ -326,6 +346,7 @@ MIME_TYPES = { "image/x-bmp" : "bmp", "image/x-ms-bmp": "bmp", "image/webp" : "webp", + "image/avif" : "avif", "image/svg+xml" : "svg", "image/ico" : "ico", "image/icon" : "ico", @@ -362,27 +383,33 @@ MIME_TYPES = { } # https://en.wikipedia.org/wiki/List_of_file_signatures -FILE_SIGNATURES = { - "jpg" : b"\xFF\xD8\xFF", - "png" : b"\x89PNG\r\n\x1A\n", - "gif" : (b"GIF87a", b"GIF89a"), - "bmp" : b"BM", - "webp": b"RIFF", - "svg" : b"<?xml", - "ico" : b"\x00\x00\x01\x00", - "cur" : b"\x00\x00\x02\x00", - "psd" : b"8BPS", - "webm": b"\x1A\x45\xDF\xA3", - "ogg" : b"OggS", - "wav" : b"RIFF", - "mp3" : (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2", b"ID3"), - "zip" : (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), - "rar" : b"\x52\x61\x72\x21\x1A\x07", - "7z" : b"\x37\x7A\xBC\xAF\x27\x1C", - "pdf" : b"%PDF-", - "swf" : (b"CWS", b"FWS"), +SIGNATURE_CHECKS = { + "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF", + "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n", + "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"), + "bmp" : lambda s: s[0:2] == b"BM", + "webp": lambda s: (s[0:4] == b"RIFF" and + s[8:12] == b"WEBP"), + "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs", + "svg" : lambda s: s[0:5] == b"<?xml", + "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00", + "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00", + "psd" : lambda s: s[0:4] == b"8BPS", + "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( + b"mp4", b"avc", b"iso", b"M4V")), + "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", + "ogg" : lambda s: s[0:4] == b"OggS", + "wav" : lambda s: (s[0:4] == b"RIFF" and + s[8:12] == b"WAVE"), + "mp3" : lambda s: (s[0:3] == b"ID3" or + s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")), + "zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), + "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07", + "7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C", + "pdf" : lambda s: s[0:5] == b"%PDF-", + "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"), # check 'bin' files against all other file signatures - "bin" : b"\x00\x00\x00\x00\x00\x00\x00\x00", + "bin" : lambda s: False, } __downloader__ = HttpDownloader diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index efa957b..c44ea0a 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -98,6 +98,7 @@ class YoutubeDLDownloader(DownloaderBase): pathfmt.realdirectory + filename) else: pathfmt.set_extension(info_dict["ext"]) + pathfmt.build_path() if pathfmt.exists(): pathfmt.temppath = "" @@ -118,6 +119,7 @@ class YoutubeDLDownloader(DownloaderBase): def _download_playlist(self, ytdl_instance, pathfmt, info_dict): pathfmt.set_extension("%(playlist_index)s.%(ext)s") + pathfmt.build_path() self._set_outtmpl(ytdl_instance, pathfmt.realpath) for entry in info_dict["entries"]: |
