From f6877087773089220d68288d055276fca6c556d4 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sun, 8 Dec 2024 20:34:33 -0500 Subject: New upstream version 1.28.1. --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bluesky.py | 20 +++- gallery_dl/extractor/common.py | 6 +- gallery_dl/extractor/danbooru.py | 23 ++--- gallery_dl/extractor/gelbooru_v02.py | 64 +------------ gallery_dl/extractor/gofile.py | 4 +- gallery_dl/extractor/hentaicosplays.py | 45 ++++++--- gallery_dl/extractor/inkbunny.py | 2 +- gallery_dl/extractor/instagram.py | 4 +- gallery_dl/extractor/kemonoparty.py | 3 +- gallery_dl/extractor/nhentai.py | 16 ++-- gallery_dl/extractor/patreon.py | 27 ++++-- gallery_dl/extractor/pixiv.py | 16 +++- gallery_dl/extractor/readcomiconline.py | 41 ++++++--- gallery_dl/extractor/realbooru.py | 157 ++++++++++++++++++++++++++++++++ gallery_dl/extractor/zerochan.py | 21 ++++- gallery_dl/version.py | 2 +- 17 files changed, 319 insertions(+), 133 deletions(-) create mode 100644 gallery_dl/extractor/realbooru.py (limited to 'gallery_dl') diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 594ce41..8d5f3d0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -140,6 +140,7 @@ modules = [ "postmill", "reactor", "readcomiconline", + "realbooru", "reddit", "redgifs", "rule34us", diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index bbff17c..f60ea15 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -75,10 +75,13 @@ class BlueskyExtractor(Extractor): quote = embed["record"] if "record" in quote: quote = quote["record"] + value = quote.pop("value", None) + if value is None: + break quote["quote_id"] = self._pid(post) quote["quote_by"] = post["author"] embed = quote.get("embed") - quote.update(quote.pop("value")) + quote.update(value) post = quote def posts(self): @@ -202,6 +205,7 @@ class BlueskyUserExtractor(BlueskyExtractor): def items(self): base = "{}/profile/{}/".format(self.root, self.user) return self._dispatch_extractors(( + (BlueskyInfoExtractor , base + "info"), (BlueskyAvatarExtractor , base + "avatar"), (BlueskyBackgroundExtractor, base + "banner"), (BlueskyPostsExtractor , base + "posts"), @@ -298,6 +302,17 @@ class BlueskyPostExtractor(BlueskyExtractor): return self.api.get_post_thread(self.user, self.post_id) +class BlueskyInfoExtractor(BlueskyExtractor): + subcategory = "info" + pattern = USER_PATTERN + r"/info" + example = "https://bsky.app/profile/HANDLE/info" + + def items(self): + self._metadata_user = True + self.api._did_from_actor(self.user) + return iter(((Message.Directory, self._user),)) + + class BlueskyAvatarExtractor(BlueskyExtractor): subcategory = "avatar" filename_fmt = "avatar_{post_id}.{extension}" @@ -324,7 +339,8 @@ class BlueskySearchExtractor(BlueskyExtractor): example = "https://bsky.app/search?q=QUERY" def posts(self): - return self.api.search_posts(self.user) + query = text.unquote(self.user.replace("+", " ")) + return self.api.search_posts(query) class BlueskyHashtagExtractor(BlueskyExtractor): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f364124..5f9d355 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -348,7 +348,7 @@ class Extractor(): ssl_options = ssl_ciphers = 0 # .netrc Authorization headers are alwsays disabled - session.trust_env = True if self.config("proxy-env", False) else False + session.trust_env = True if self.config("proxy-env", True) else False browser = self.config("browser") if browser is None: @@ -387,8 +387,8 @@ class Extractor(): useragent = self.useragent elif useragent == "browser": useragent = _browser_useragent() - elif useragent is config.get(("extractor",), "user-agent") and \ - useragent == Extractor.useragent: + elif self.useragent is not Extractor.useragent and \ + useragent is config.get(("extractor",), "user-agent"): useragent = self.useragent headers["User-Agent"] = useragent headers["Accept"] = "*/*" diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index c3dfd91..37b6747 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -26,16 +26,7 @@ class DanbooruExtractor(BaseExtractor): def _init(self): self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) - - includes = self.config("metadata") - if includes: - if isinstance(includes, (list, tuple)): - includes = ",".join(includes) - elif not isinstance(includes, str): - includes = "artist_commentary,children,notes,parent,uploader" - self.includes = includes + ",id" - else: - self.includes = False + self.includes = False threshold = self.config("threshold") if isinstance(threshold, int): @@ -56,6 +47,16 @@ class DanbooruExtractor(BaseExtractor): return pages * self.per_page def items(self): + # 'includes' initialization must be done here and not in '_init()' + # or it'll cause an exception with e621 when 'metadata' is enabled + includes = self.config("metadata") + if includes: + if isinstance(includes, (list, tuple)): + includes = ",".join(includes) + elif not isinstance(includes, str): + includes = "artist_commentary,children,notes,parent,uploader" + self.includes = includes + ",id" + data = self.metadata() for post in self.posts(): @@ -223,7 +224,7 @@ class DanbooruTagExtractor(DanbooruExtractor): else: prefix = None elif tag.startswith( - ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")): + ("id:", "md5:", "ordfav:", "ordfavgroup:", "ordpool:")): prefix = None break diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index aad5752..2c1174a 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -24,10 +24,6 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.user_id = self.config("user-id") self.root_api = self.config_instance("root-api") or self.root - if self.category == "realbooru": - self.items = self._items_realbooru - self._tags = self._tags_realbooru - def _api_request(self, params): url = self.root_api + "/index.php?page=dapi&s=post&q=index" return ElementTree.fromstring(self.request(url, params=params).text) @@ -82,16 +78,17 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start * self.per_page data = {} + find_ids = re.compile(r"\sid=\"p(\d+)").findall + while True: - num_ids = 0 page = self.request(url, params=params).text + pids = find_ids(page) - for data["id"] in text.extract_iter(page, '" id="p', '"'): - num_ids += 1 + for data["id"] in pids: for post in self._api_request(data): yield post.attrib - if num_ids < self.per_page: + if len(pids) < self.per_page: return params["pid"] += self.per_page @@ -136,59 +133,8 @@ class GelbooruV02Extractor(booru.BooruExtractor): "body" : text.unescape(text.remove_html(extr(">", ""))), }) - def _file_url_realbooru(self, post): - url = post["file_url"] - md5 = post["md5"] - if md5 not in post["preview_url"] or url.count("/") == 5: - url = "{}/images/{}/{}/{}.{}".format( - self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) - return url - - def _items_realbooru(self): - from .common import Message - data = self.metadata() - - for post in self.posts(): - try: - html = self._html(post) - fallback = post["file_url"] - url = post["file_url"] = text.rextract( - html, 'href="', '"', html.index(">Original<"))[0] - except Exception: - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) - continue - - text.nameext_from_url(url, post) - post.update(data) - self._prepare(post) - self._tags(post, html) - - path = url.rpartition("/")[0] - post["_fallback"] = ( - "{}/{}.{}".format(path, post["md5"], post["extension"]), - fallback, - ) - - yield Message.Directory, post - yield Message.Url, url, post - - def _tags_realbooru(self, post, page): - tag_container = text.extr(page, 'id="tagLink"', '') - tags = collections.defaultdict(list) - pattern = re.compile( - r'') + + post = { + "_html" : page, + "id" : post_id, + "rating" : "e" if rating == "adult" else (rating or "?")[0], + "tags" : text.unescape(extr(' alt="', '"')), + "file_url" : extr('src="', '"'), + "created_at": extr(">Posted at ", " by "), + "uploader" : extr(">", "<"), + "score" : extr('">', "<"), + "title" : extr('id="title" style="width: 100%;" value="', '"'), + "source" : extr('d="source" style="width: 100%;" value="', '"'), + } + + post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] + return post + + def skip(self, num): + self.page_start += num + return num + + def _prepare(self, post): + post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y") + + def _pagination(self, params, begin, end): + url = self.root + "/index.php" + params["pid"] = self.page_start + + while True: + page = self.request(url, params=params).text + + cnt = 0 + for post_id in text.extract_iter(page, begin, end): + cnt += 1 + yield self._parse_post(post_id) + + if cnt < self.per_page: + return + params["pid"] += self.per_page + + def _tags(self, post, _): + page = post["_html"] + tag_container = text.extr(page, 'id="tagLink"', '') + tags = collections.defaultdict(list) + pattern = re.compile( + r'Pool: ", "") + self.post_ids = text.extract_iter( + page, 'class="thumb" id="p', '"', pos) + + return { + "pool": text.parse_int(pool_id), + "pool_name": text.unescape(name), + } + + def posts(self): + return map( + self._parse_post, + util.advance(self.post_ids, self.page_start) + ) + + +class RealbooruPostExtractor(RealbooruExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + example = "https://realbooru.com/index.php?page=post&s=view&id=12345" + + def posts(self): + return (self._parse_post(self.groups[0]),) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index f9b1a7f..4c4fb3a 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -145,6 +145,14 @@ class ZerochanTagExtractor(ZerochanExtractor): self.posts = self.posts_api self.session.headers["User-Agent"] = util.USERAGENT + exts = self.config("extensions") + if exts: + if isinstance(exts, str): + exts = exts.split(",") + self.exts = exts + else: + self.exts = ("jpg", "png", "webp", "gif") + def metadata(self): return {"search_tags": text.unquote( self.search_tag.replace("+", " "))} @@ -194,8 +202,6 @@ class ZerochanTagExtractor(ZerochanExtractor): "p" : self.page_start, } - static = "https://static.zerochan.net/.full." - while True: response = self.request(url, params=params, allow_redirects=False) @@ -221,15 +227,20 @@ class ZerochanTagExtractor(ZerochanExtractor): yield post else: for post in posts: - base = static + str(post["id"]) - post["file_url"] = base + ".jpg" - post["_fallback"] = (base + ".png",) + urls = self._urls(post) + post["file_url"] = next(urls) + post["_fallback"] = urls yield post if not data.get("next"): return params["p"] += 1 + def _urls(self, post, static="https://static.zerochan.net/.full."): + base = static + str(post["id"]) + "." + for ext in self.exts: + yield base + ext + class ZerochanImageExtractor(ZerochanExtractor): subcategory = "image" diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 2bf03f4..2dab0d6 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.0" +__version__ = "1.28.1" __variant__ = None -- cgit v1.2.3