diff options
| author | 2020-12-13 23:07:42 -0500 | |
|---|---|---|
| committer | 2020-12-13 23:07:42 -0500 | |
| commit | 8f7c87a2697113134c311aaeafd9c919555a2741 (patch) | |
| tree | 4ff7316ac1570683b3c968fd30d044925e47a2a5 /gallery_dl/downloader | |
| parent | 143723944033d7a6593d57bd1cf6ae97713b6ce7 (diff) | |
New upstream version 1.16.0. (ref: upstream/1.16.0)
Diffstat (limited to 'gallery_dl/downloader')
| -rw-r--r-- | gallery_dl/downloader/http.py | 212 |
1 file changed, 126 insertions, 86 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 0e67330..b8546a8 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -44,12 +44,14 @@ class HttpDownloader(DownloaderBase): if self.minsize: minsize = text.parse_bytes(self.minsize) if not minsize: - self.log.warning("Invalid minimum filesize (%r)", self.minsize) + self.log.warning( + "Invalid minimum file size (%r)", self.minsize) self.minsize = minsize if self.maxsize: maxsize = text.parse_bytes(self.maxsize) if not maxsize: - self.log.warning("Invalid maximum filesize (%r)", self.maxsize) + self.log.warning( + "Invalid maximum file size (%r)", self.maxsize) self.maxsize = maxsize if self.rate: rate = text.parse_bytes(self.rate) @@ -84,17 +86,20 @@ class HttpDownloader(DownloaderBase): if tries: if response: response.close() + response = None self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) if tries > self.retries: return False time.sleep(tries) - tries += 1 + tries += 1 headers = {} + file_header = None + # check for .part file - filesize = pathfmt.part_size() - if filesize: - headers["Range"] = "bytes={}-".format(filesize) + file_size = pathfmt.part_size() + if file_size: + headers["Range"] = "bytes={}-".format(file_size) # file-specific headers extra = pathfmt.kwdict.get("_http_headers") if extra: @@ -118,9 +123,9 @@ class HttpDownloader(DownloaderBase): offset = 0 size = response.headers.get("Content-Length") elif code == 206: # Partial Content - offset = filesize + offset = file_size size = response.headers["Content-Range"].rpartition("/")[2] - elif code == 416 and filesize: # Requested Range Not Satisfiable + elif code == 416 and file_size: # Requested Range Not Satisfiable break else: msg = "'{} {}' for '{}'".format(code, response.reason, url) @@ -129,7 +134,14 @@ class HttpDownloader(DownloaderBase): self.log.warning(msg) return False - # check filesize + # set missing filename extension from MIME type + if not pathfmt.extension: + 
pathfmt.set_extension(self._find_extension(response)) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + + # check file size size = text.parse_int(size, None) if size is not None: if self.minsize and size < self.minsize: @@ -143,50 +155,59 @@ class HttpDownloader(DownloaderBase): size, self.maxsize) return False - # set missing filename extension - if not pathfmt.extension: - pathfmt.set_extension(self.get_extension(response)) - if pathfmt.exists(): + content = response.iter_content(self.chunk_size) + + # check filename extension against file header + if self.adjust_extension and not offset and \ + pathfmt.extension in FILE_SIGNATURES: + try: + file_header = next( + content if response.raw.chunked + else response.iter_content(16), b"") + except (RequestException, SSLError, OpenSSLError) as exc: + msg = str(exc) + print() + continue + if self._adjust_extension(pathfmt, file_header) and \ + pathfmt.exists(): pathfmt.temppath = "" return True # set open mode if not offset: mode = "w+b" - if filesize: + if file_size: self.log.debug("Unable to resume partial download") else: mode = "r+b" self.log.debug("Resuming download at byte %d", offset) - # start downloading - self.out.start(pathfmt.path) + # download content self.downloading = True - with pathfmt.open(mode) as file: - if offset: - file.seek(offset) - - # download content + with pathfmt.open(mode) as fp: + if file_header: + fp.write(file_header) + elif offset: + if self.adjust_extension and \ + pathfmt.extension in FILE_SIGNATURES: + self._adjust_extension(pathfmt, fp.read(16)) + fp.seek(offset) + + self.out.start(pathfmt.path) try: - self.receive(response, file) + self.receive(fp, content) except (RequestException, SSLError, OpenSSLError) as exc: msg = str(exc) print() continue - # check filesize - if size and file.tell() < size: - msg = "filesize mismatch ({} < {})".format( - file.tell(), size) + # check file size + if size and fp.tell() < size: + msg = "file size mismatch ({} < {})".format( + 
fp.tell(), size) print() continue - # check filename extension - if self.adjust_extension: - adj_ext = self.check_extension(file, pathfmt.extension) - if adj_ext: - pathfmt.set_extension(adj_ext) - break self.downloading = False @@ -198,16 +219,18 @@ class HttpDownloader(DownloaderBase): return True - def receive(self, response, file): - for data in response.iter_content(self.chunk_size): - file.write(data) + @staticmethod + def receive(fp, content): + write = fp.write + for data in content: + write(data) - def _receive_rate(self, response, file): - t1 = time.time() + def _receive_rate(self, fp, content): rt = self.rate + t1 = time.time() - for data in response.iter_content(self.chunk_size): - file.write(data) + for data in content: + fp.write(data) t2 = time.time() # current time actual = t2 - t1 # actual elapsed time @@ -220,81 +243,98 @@ class HttpDownloader(DownloaderBase): else: t1 = t2 - def get_extension(self, response): + def _find_extension(self, response): + """Get filename extension from MIME type""" mtype = response.headers.get("Content-Type", "image/jpeg") mtype = mtype.partition(";")[0] if "/" not in mtype: mtype = "image/" + mtype - if mtype in MIMETYPE_MAP: - return MIMETYPE_MAP[mtype] + if mtype in MIME_TYPES: + return MIME_TYPES[mtype] - exts = mimetypes.guess_all_extensions(mtype, strict=False) - if exts: - exts.sort() - return exts[-1][1:] + ext = mimetypes.guess_extension(mtype, strict=False) + if ext: + return ext[1:] - self.log.warning( - "No filename extension found for MIME type '%s'", mtype) - return "txt" + self.log.warning("Unknown MIME type '%s'", mtype) + return "bin" @staticmethod - def check_extension(file, extension): - """Check filename extension against fileheader""" - if extension in FILETYPE_CHECK: - file.seek(0) - header = file.read(8) - if len(header) >= 8 and not FILETYPE_CHECK[extension](header): - for ext, check in FILETYPE_CHECK.items(): - if ext != extension and check(header): - return ext - return None - - 
-FILETYPE_CHECK = { - "jpg": lambda h: h[0:2] == b"\xff\xd8", - "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", - "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, -} + def _adjust_extension(pathfmt, file_header): + """Check filename extension against file header""" + sig = FILE_SIGNATURES[pathfmt.extension] + if not file_header.startswith(sig): + for ext, sig in FILE_SIGNATURES.items(): + if file_header.startswith(sig): + pathfmt.set_extension(ext) + return True + return False -MIMETYPE_MAP = { - "image/jpeg": "jpg", - "image/jpg": "jpg", - "image/png": "png", - "image/gif": "gif", - "image/bmp": "bmp", - "image/x-bmp": "bmp", +MIME_TYPES = { + "image/jpeg" : "jpg", + "image/jpg" : "jpg", + "image/png" : "png", + "image/gif" : "gif", + "image/bmp" : "bmp", + "image/x-bmp" : "bmp", "image/x-ms-bmp": "bmp", - "image/webp": "webp", - "image/svg+xml": "svg", + "image/webp" : "webp", + "image/svg+xml" : "svg", + "image/x-photoshop" : "psd", + "application/x-photoshop" : "psd", "image/vnd.adobe.photoshop": "psd", - "image/x-photoshop": "psd", - "application/x-photoshop": "psd", "video/webm": "webm", - "video/ogg": "ogg", - "video/mp4": "mp4", + "video/ogg" : "ogg", + "video/mp4" : "mp4", - "audio/wav": "wav", + "audio/wav" : "wav", "audio/x-wav": "wav", - "audio/webm": "webm", - "audio/ogg": "ogg", - "audio/mpeg": "mp3", + "audio/webm" : "webm", + "audio/ogg" : "ogg", + "audio/mpeg" : "mp3", - "application/zip": "zip", + "application/zip" : "zip", "application/x-zip": "zip", "application/x-zip-compressed": "zip", - "application/rar": "rar", + "application/rar" : "rar", "application/x-rar": "rar", "application/x-rar-compressed": "rar", - "application/x-7z-compressed": "7z", + "application/x-7z-compressed" : "7z", + + "application/pdf" : "pdf", + "application/x-pdf": "pdf", + "application/x-shockwave-flash": "swf", "application/ogg": "ogg", "application/octet-stream": "bin", } +# taken from https://en.wikipedia.org/wiki/List_of_file_signatures 
+FILE_SIGNATURES = { + "jpg" : b"\xFF\xD8\xFF", + "png" : b"\x89PNG\r\n\x1A\n", + "gif" : (b"GIF87a", b"GIF89a"), + "bmp" : b"BM", + "webp": b"RIFF", + "svg" : b"<?xml", + "psd" : b"8BPS", + "webm": b"\x1A\x45\xDF\xA3", + "ogg" : b"OggS", + "wav" : b"RIFF", + "mp3" : (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2", b"ID3"), + "zip" : (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), + "rar" : b"\x52\x61\x72\x21\x1A\x07", + "7z" : b"\x37\x7A\xBC\xAF\x27\x1C", + "pdf" : b"%PDF-", + "swf" : (b"CWS", b"FWS"), + # check 'bin' files against all other file signatures + "bin" : b"\x00\x00\x00\x00", +} + __downloader__ = HttpDownloader |
