diff options
Diffstat (limited to 'gallery_dl/extractor/gelbooru.py')
| -rw-r--r-- | gallery_dl/extractor/gelbooru.py | 111 |
1 files changed, 26 insertions, 85 deletions
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index c32ba5c..b0614e2 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -6,98 +6,27 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://gelbooru.com/""" +"""Extractors for https://gelbooru.com/""" from . import booru -from .common import Message -from .. import text +from .. import text, exception -class GelbooruExtractor(booru.XmlParserMixin, - booru.GelbooruPageMixin, - booru.BooruExtractor): +class GelbooruBase(): """Base class for gelbooru extractors""" category = "gelbooru" - api_url = "https://gelbooru.com/index.php" - post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}" - pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}" + root = "https://gelbooru.com" - def __init__(self, match): - super().__init__(match) - - self.use_api = self.config("api", True) - if self.use_api: - self.params.update({"page": "dapi", "s": "post", "q": "index"}) - else: - self.items = self.items_noapi - self.session.cookies["fringeBenefits"] = "yup" - self.per_page = 42 - - @staticmethod - def get_file_url(image): - url = image["file_url"] + def _prepare_post(self, post, extended_tags=False): + url = booru.BooruExtractor._prepare_post(self, post, extended_tags) if url.startswith("https://mp4.gelbooru.com/"): - ihash = image["md5"] + md5 = post["md5"] return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format( - ihash[0:2], ihash[2:4], ihash) + md5[0:2], md5[2:4], md5) return url - def items_noapi(self): - yield Message.Version, 1 - data = self.get_metadata() - - for post in self.get_posts(): - post = self.get_post_data(post) - url = post["file_url"] - post.update(data) - text.nameext_from_url(url, post) - yield Message.Directory, post - yield Message.Url, url, post - - def get_posts(self): - """Return an iterable containing all relevant post objects""" - url = "https://gelbooru.com/index.php?page=post&s=list" - params = { - "tags": self.params["tags"], - "pid" : self.page_start * self.per_page - } - - while True: - page = self.request(url, params=params).text - ids = list(text.extract_iter(page, '<span id="s', '"')) - yield from ids - if len(ids) < self.per_page: - return - params["pid"] += self.per_page - - def get_post_data(self, post_id): - """Extract metadata of a single post""" - page = self.request(self.post_url.format(post_id)).text - data = text.extract_all(page, ( - (None , '<meta name="keywords"', ''), - ("tags" , ' imageboard- ', '"'), - ("id" , '<li>Id: ', '<'), - ("created_at", '<li>Posted: ', '<'), - ("width" , '<li>Size: ', 'x'), - ("height" , '', '<'), - ("source" , '<li>Source: <a href="', '"'), - ("rating" , '<li>Rating: ', '<'), - (None , '<li>Score: ', ''), - ("score" , '>', '<'), - ("file_url" , '<li><a href="http', '"'), - ("change" , ' id="lupdated" value="', '"'), - ))[0] - data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1) - data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0] - data["rating"] = (data["rating"] or "?")[0].lower() - data["tags"] = " ".join( - [tag.replace(" ", "_") for tag in data["tags"].split(", ")]) - if self.extags: - self.extended_tags(data, page) - return data - -class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor): +class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor): """Extractor for images from gelbooru.com based on search-tags""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") @@ -112,7 +41,7 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor): ) -class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor): +class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor): """Extractor for image-pools from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(?P<pool>\d+)") @@ -126,8 +55,23 @@ class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor): }), ) + def metadata(self): + url = "{}/index.php?page=pool&s=show&id={}".format( + self.root, self.pool_id) + page = self.request(url).text + + name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>") + if not name: + raise exception.NotFoundError("pool") + self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos) + + return { + "pool": text.parse_int(self.pool_id), + "pool_name": text.unescape(name), + } + -class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): +class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor): """Extractor for single images from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=view&id=(?P<post>\d+)") @@ -135,6 +79,3 @@ class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "count": 1, }) - - def get_posts(self): - return (self.post,) |
