diff options
Diffstat (limited to 'gallery_dl/extractor/gelbooru_v02.py')
| -rw-r--r-- | gallery_dl/extractor/gelbooru_v02.py | 73 |
1 files changed, 35 insertions, 38 deletions
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 2c1174a..c12a7a2 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,10 +10,7 @@ from . import booru from .. import text, util, exception - -from xml.etree import ElementTree import collections -import re class GelbooruV02Extractor(booru.BooruExtractor): @@ -24,9 +21,12 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.user_id = self.config("user-id") self.root_api = self.config_instance("root-api") or self.root + if self.category == "rule34": + self._file_url = self._file_url_rule34 + def _api_request(self, params): url = self.root_api + "/index.php?page=dapi&s=post&q=index" - return ElementTree.fromstring(self.request(url, params=params).text) + return self.request_xml(url, params=params) def _pagination(self, params): params["pid"] = self.page_start @@ -38,7 +38,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): while True: try: root = self._api_request(params) - except ElementTree.ParseError: + except SyntaxError: # ElementTree.ParseError if "tags" not in params or post is None: raise taglist = [tag for tag in params["tags"].split() @@ -50,7 +50,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): if total is None: try: - total = int(root.attrib["count"]) + self.kwdict["total"] = total = int(root.attrib["count"]) + if "search_tags" in self.kwdict: + self.kwdict["search_count"] = total self.log.debug("%s posts in total", total) except Exception as exc: total = 0 @@ -78,7 +80,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start * self.per_page data = {} - find_ids = re.compile(r"\sid=\"p(\d+)").findall + find_ids = util.re(r"\sid=\"p(\d+)").findall while True: page = self.request(url, params=params).text @@ -92,15 +94,24 @@ class GelbooruV02Extractor(booru.BooruExtractor): return params["pid"] += self.per_page - @staticmethod - def _prepare(post): + def _file_url_rule34(self, post): + url = post["file_url"] + + if text.ext_from_url(url) not in util.EXTS_VIDEO: + path = url.partition(".")[2] + post["_fallback"] = (url,) + post["file_url"] = url = "https://wimg." + path + + return url + + def _prepare(self, post): post["tags"] = post["tags"].strip() post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") def _html(self, post): - return self.request("{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"])).text + url = f"{self.root}/index.php?page=post&s=view&id={post['id']}" + return self.request(url).text def _tags(self, post, page): tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or @@ -109,8 +120,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): return tags = collections.defaultdict(list) - pattern = re.compile( - r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) + pattern = util.re(r"(?s)tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)") for tag_type, tag_name in pattern.findall(tag_container): tags[tag_type].append(text.unescape(text.unquote(tag_name))) for key, value in tags.items(): @@ -166,18 +176,13 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)" example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - tags = match.group(match.lastindex) - self.tags = text.unquote(tags.replace("+", " ")) - - def metadata(self): - return {"search_tags": self.tags} - def posts(self): - if self.tags == "all": - self.tags = "" - return self._pagination({"tags": self.tags}) + self.kwdict["search_tags"] = tags = text.unquote( + self.groups[-1].replace("+", " ")) + + if tags == "all": + tags = "" + return self._pagination({"tags": tags}) class GelbooruV02PoolExtractor(GelbooruV02Extractor): @@ -189,7 +194,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): def __init__(self, match): GelbooruV02Extractor.__init__(self, match) - self.pool_id = match.group(match.lastindex) + self.pool_id = self.groups[-1] if self.category == "rule34": self.posts = self._posts_pages @@ -202,8 +207,7 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): return num def metadata(self): - url = "{}/index.php?page=pool&s=show&id={}".format( - self.root, self.pool_id) + url = f"{self.root}/index.php?page=pool&s=show&id={self.pool_id}" page = self.request(url).text name, pos = text.extract(page, "<h4>Pool: ", "</h4>") @@ -239,12 +243,9 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.favorite_id = match.group(match.lastindex) - def metadata(self): - return {"favorite_id": text.parse_int(self.favorite_id)} + self.favorite_id = fav_id = self.groups[-1] + return {"favorite_id": text.parse_int(fav_id)} def posts(self): return self._pagination_html({ @@ -260,9 +261,5 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=post&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.post_id = match.group(match.lastindex) - def posts(self): - return self._pagination({"id": self.post_id}) + return self._pagination({"id": self.groups[-1]}) |
