From 3338dfce719c999467ffe08fd45663be8190057a Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Sun, 4 Dec 2022 23:25:06 -0500
Subject: New upstream version 1.24.1.

---
 gallery_dl/downloader/http.py      |  16 +-
 gallery_dl/extractor/2chen.py      |  16 +-
 gallery_dl/extractor/__init__.py   |   1 +
 gallery_dl/extractor/artstation.py |  11 +-
 gallery_dl/extractor/bcy.py        |  16 +-
 gallery_dl/extractor/bunkr.py      |   8 +-
 gallery_dl/extractor/danbooru.py   |   1 -
 gallery_dl/extractor/fapachi.py    |  85 ++++++++++
 gallery_dl/extractor/hitomi.py     |  13 +-
 gallery_dl/extractor/hotleak.py    |   1 +
 gallery_dl/extractor/imagehosts.py |  19 +++
 gallery_dl/extractor/inkbunny.py   |  36 ++++-
 gallery_dl/extractor/itaku.py      |  10 +-
 gallery_dl/extractor/lolisafe.py   |  22 +--
 gallery_dl/extractor/nitter.py     | 314 +++++++++++++++++++++++++++++++------
 gallery_dl/extractor/patreon.py    |  70 ++++++---
 gallery_dl/extractor/pixiv.py      |   2 +-
 gallery_dl/extractor/reddit.py     |   5 +
 gallery_dl/extractor/twitter.py    |  38 ++---
 gallery_dl/extractor/weibo.py      |  45 ++++--
 gallery_dl/formatter.py            |  15 ++
 gallery_dl/job.py                  |  10 +-
 gallery_dl/path.py                 |   2 +-
 gallery_dl/version.py              |   2 +-
 24 files changed, 596 insertions(+), 162 deletions(-)
 create mode 100644 gallery_dl/extractor/fapachi.py

diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 26eb7b5..4037420 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -37,6 +37,7 @@ class HttpDownloader(DownloaderBase):
         self.minsize = self.config("filesize-min")
         self.maxsize = self.config("filesize-max")
         self.retries = self.config("retries", extractor._retries)
+        self.retry_codes = self.config("retry-codes")
         self.timeout = self.config("timeout", extractor._timeout)
         self.verify = self.config("verify", extractor._verify)
         self.mtime = self.config("mtime", True)
@@ -44,6 +45,8 @@ class HttpDownloader(DownloaderBase):
 
         if self.retries < 0:
             self.retries = float("inf")
+        if self.retry_codes is None:
+            self.retry_codes = [429]
         if self.minsize:
             minsize = text.parse_bytes(self.minsize)
             if not minsize:
@@ -74,6 +77,8 @@ class HttpDownloader(DownloaderBase):
             self.log.warning("Invalid rate limit (%r)", self.rate)
         if self.progress is not None:
             self.receive = self._receive_rate
+            if self.progress < 0.0:
+                self.progress = 0.0
 
     def download(self, url, pathfmt):
         try:
@@ -96,6 +101,13 @@ class HttpDownloader(DownloaderBase):
         adjust_extension = kwdict.get(
             "_http_adjust_extension", self.adjust_extension)
 
+        codes = kwdict.get("_http_retry_codes")
+        if codes:
+            retry_codes = self.retry_codes.copy()
+            retry_codes += codes
+        else:
+            retry_codes = self.retry_codes
+
         if self.part and not metadata:
             pathfmt.part_enable(self.partdir)
@@ -156,7 +168,7 @@ class HttpDownloader(DownloaderBase):
                     break
                 else:
                     msg = "'{} {}' for '{}'".format(code, response.reason, url)
-                    if code == 429 or 500 <= code < 600:  # Server Error
+                    if code in retry_codes or 500 <= code < 600:
                         continue
                     self.log.warning(msg)
                     return False
@@ -295,7 +307,7 @@ class HttpDownloader(DownloaderBase):
                 write(data)
 
                 if progress is not None:
-                    if time_elapsed >= progress:
+                    if time_elapsed > progress:
                         self.out.progress(
                             bytes_total,
                             bytes_start + bytes_downloaded,
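The hunks above make the set of retried HTTP status codes configurable: a global "retry-codes" option (defaulting to [429]) is merged with any per-file "_http_retry_codes" list, and the merged set plus all 5xx responses trigger a retry. A minimal standalone sketch of that decision follows; the helper names (build_retry_codes, should_retry) are invented for illustration, since the real logic lives inline in HttpDownloader:

    # Sketch only; helper names are illustrative, not part of the patch.
    DEFAULT_RETRY_CODES = [429]      # used when "retry-codes" is unset

    def build_retry_codes(configured, kwdict):
        """Merge the global option with a file's _http_retry_codes."""
        codes = DEFAULT_RETRY_CODES if configured is None else configured
        extra = kwdict.get("_http_retry_codes")
        return codes + list(extra) if extra else codes

    def should_retry(status, retry_codes):
        """Listed codes and any 5xx response are retried."""
        return status in retry_codes or 500 <= status < 600

    assert should_retry(429, build_retry_codes(None, {}))
    assert should_retry(503, build_retry_codes(None, {}))
    assert should_retry(404, build_retry_codes(None, {"_http_retry_codes": [404]}))
    assert not should_retry(404, build_retry_codes(None, {}))

An extractor can therefore attach "_http_retry_codes" to a file's metadata to extend the retryable set for that download alone.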
{filename}.{extension}" - archive_fmt = "{board}_{thread}_{hash}" + archive_fmt = "{board}_{thread}_{hash}_{time}" root = "https://2chen.moe" pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)" test = ( - ("https://2chen.moe/jp/303786", { - "count": ">= 10", + ("https://2chen.moe/tv/496715", { + "count": ">= 179", }), + # 404 + ("https://2chen.moe/jp/303786"), ) def __init__(self, match): @@ -31,7 +33,7 @@ class _2chenThreadExtractor(Extractor): def items(self): url = "{}/{}/{}".format(self.root, self.board, self.thread) - page = self.request(url, encoding="utf-8").text + page = self.request(url, encoding="utf-8", notfound="thread").text data = self.metadata(page) yield Message.Directory, data for post in self.posts(page): @@ -66,7 +68,7 @@ class _2chenThreadExtractor(Extractor): "%d %b %Y (%a) %H:%M:%S" ), "no" : extr('href="#p', '"'), - "url" : extr('06}." + ext).format delays = data["Ugoira:FrameDelays"] return [{"file": fmt(index), "delay": delay} diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py new file mode 100644 index 0000000..ee6d15a --- /dev/null +++ b/gallery_dl/extractor/fapachi.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://fapachi.com/""" + +from .common import Extractor, Message +from .. import text + + +class FapachiPostExtractor(Extractor): + """Extractor for individual posts on fapachi.com""" + category = "fapachi" + subcategory = "post" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{user}_{id}.{extension}" + archive_fmt = "{user}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?fapachi\.com" + r"/(?!search/)([^/?#]+)/media/(\d+)") + root = "https://fapachi.com" + test = ( + # NSFW + ("https://fapachi.com/sonson/media/0082", { + "pattern": (r"https://fapachi\.com/models/s/o/" + r"sonson/1/full/sonson_0082\.jpeg"), + "keyword": { + "user": "sonson", + "id" : "0082", + }, + }), + # NSFW + ("https://fapachi.com/ferxiita/media/0159"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user, self.id = match.groups() + + def items(self): + data = { + "user": self.user, + "id" : self.id, + } + page = self.request("{}/{}/media/{}".format( + self.root, self.user, self.id)).text + url = self.root + text.extr(page, 'd-block" src="', '"') + yield Message.Directory, data + yield Message.Url, url, text.nameext_from_url(url, data) + + +class FapachiUserExtractor(Extractor): + """Extractor for all posts from a fapachi user""" + category = "fapachi" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?fapachi\.com" + r"/(?!search(?:/|$))([^/?#]+)(?:/page/(\d+))?$") + root = "https://fapachi.com" + test = ( + ("https://fapachi.com/sonson", { + "pattern": FapachiPostExtractor.pattern, + "range" : "1-50", + "count" : 50, + }), + ("https://fapachi.com/ferxiita/page/3"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.num = text.parse_int(match.group(2), 1) + + def items(self): + data = {"_extractor": FapachiPostExtractor} + while True: + page = self.request("{}/{}/page/{}".format( + self.root, self.user, self.num)).text + for post in text.extract_iter(page, 'model-media-prew">', ">"): + url = self.root + text.extr(post, 'Next page' not in page: + return + self.num += 1 diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py 
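The new FapachiUserExtractor above walks the /<user>/page/<n> listing pages and defers every post it finds to FapachiPostExtractor. The same loop in isolation, as a sketch: `fetch` stands in for the extractor's self.request(), and a plain regex stands in for gallery-dl's text.extract_iter() markers:

    import re

    def iter_post_urls(fetch, root, user):
        """Yield absolute post URLs for `user`, one listing page at a time."""
        num = 1
        while True:
            page = fetch("{}/{}/page/{}".format(root, user, num))
            for path in re.findall(r'href="(/[^"]+/media/\d+)"', page):
                yield root + path          # matches FapachiPostExtractor.pattern
            if "Next page" not in page:    # no further listing pages
                return
            num += 1

In the extractor itself each such URL is emitted as a Message.Queue item, so a post extractor is spawned to download the full-size image.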
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index cc110aa..44459ce 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -115,16 +115,17 @@
 
         fmt = self.config("format") or "webp"
         if fmt == "original":
-            subdomain, fmt, ext, check = "b", "images", None, False
+            subdomain, path, ext, check = "b", "images", None, False
         else:
-            subdomain, ext, check = "a", fmt, True
+            subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp")
 
         result = []
         for image in self.info["files"]:
             if check:
-                if not image.get("has" + fmt):
-                    fmt = ext = "webp"
-                    check = False
+                if image.get("has" + fmt):
+                    path = ext = fmt
+                else:
+                    path = ext = "webp"
 
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
             if ext:
@@ -134,7 +135,7 @@
             inum = int(ihash[-1] + ihash[-3:-1], 16)
             url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format(
                 chr(97 + gg_m.get(inum, gg_default)),
-                subdomain, fmt, gg_b, inum, ihash, idata["extension"],
+                subdomain, path, gg_b, inum, ihash, idata["extension"],
             )
             result.append((url, idata))
         return result

diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 01ad38c..eb64db0 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -146,6 +146,7 @@ class HotleakCreatorExtractor(HotleakExtractor):
                     self.wait(
                         until=exc.response.headers.get("X-RateLimit-Reset"))
                     continue
+                raise
 
             posts = response.json()
             if not posts:

diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 622509f..6fcfc55 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -242,6 +242,25 @@ class PixhostImageExtractor(ImagehostImageExtractor):
         return url, filename
 
 
+class PixhostGalleryExtractor(ImagehostImageExtractor):
+    """Extractor for image galleries from pixhost.to"""
+    category = "pixhost"
+    subcategory = "gallery"
+    pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
+               r"/gallery/([^/?#]+))")
+    test = ("https://pixhost.to/gallery/jSMFq", {
+        "pattern": PixhostImageExtractor.pattern,
+        "count": 3,
+    })
+
+    def items(self):
+        page = text.extr(self.request(
+            self.page_url).text, 'class="images"', "</div>")
+        data = {"_extractor": PixhostImageExtractor}
+        for url in text.extract_iter(page, '<a href="', '"'):
+            yield Message.Queue, url, data

[...]

diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
[...]
             ")[2],
-            "_attach": extr('class="attachments', 'class="tweet-stats'),
+            "_attach" : extr('class="attachments', 'class="tweet-stats'),
             "comments": text.parse_int(extr(
                 'class="icon-comment', '').rpartition(">")[2]),
             "retweets": text.parse_int(extr(
@@ -73,17 +125,87 @@
                 'class="icon-quote', '').rpartition(">")[2]),
             "likes"   : text.parse_int(extr(
                 'class="icon-heart', '').rpartition(">")[2]),
+            "retweet" : 'class="retweet-header' in html,
+            "quoted": False,
         }
+
+    def _tweet_from_quote(self, html):
+        extr = text.extract_from(html)
+        author = {
+            "name": extr('class="fullname" href="/', '"'),
+            "nick": extr('title="', '"'),
+        }
+        extr('")[2],
+            "_attach" : extr('class="attachments', ''' '''),
+            "retweet" : False,
+            "quoted": True,
+        }
+
+    def _user_from_html(self, html):
+        extr = text.extract_from(html, html.index('class="profile-tabs'))
+        banner = extr('class="profile-banner">', '<'),
+            "date"            : text.parse_datetime(
+                extr('class="profile-joindate">', '<').replace(",", "")),
+            "friends_count"   : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "followers_count" : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "favourites_count": text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "verified" : 'title="Verified account"' in html,
+        }
+
+    def _extract_quote(self, html):
+        html, _, quote = html.partition('class="quote')
+        if quote:
+            quote, _, tail = quote.partition('class="tweet-published')
+            return (html + tail, quote)
+        return (html, None)
 
     def _pagination(self, path):
-        base_url = url = self.root + path
+        quoted = self.config("quoted", False)
+
+        if self.user_id:
+            self.user = self.request(
+                "{}/i/user/{}".format(self.root, self.user_id),
+                allow_redirects=False,
+            ).headers["location"].rpartition("/")[2]
+        base_url = url = "{}/{}{}".format(self.root, self.user, path)
 
         while True:
-            page = self.request(url).text
+            tweets_html = self.request(url).text.split(
+                '
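The _pagination() change above resolves a numeric user ID to a screen name by requesting /i/user/<id> without following the redirect and reading the Location header. The same idea as a standalone sketch, with requests standing in for the extractor's self.request():

    import requests

    def resolve_screen_name(root, user_id):
        """Map a numeric user ID to a screen name via the instance's redirect."""
        response = requests.get(
            "{}/i/user/{}".format(root, user_id), allow_redirects=False)
        # Location is e.g. "/<screenname>"; keep the last path segment
        return response.headers["location"].rpartition("/")[2]

    # e.g. resolve_screen_name("https://nitter.net", "783214")
    # should return "Twitter", assuming a reachable instance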