summary refs log tree commit diff stats
path: root/gallery_dl/extractor/gelbooru.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/gelbooru.py')
-rw-r--r-- gallery_dl/extractor/gelbooru.py 111
1 files changed, 26 insertions, 85 deletions
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index c32ba5c..b0614e2 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -6,98 +6,27 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://gelbooru.com/"""
+"""Extractors for https://gelbooru.com/"""
from . import booru
-from .common import Message
-from .. import text
+from .. import text, exception
-class GelbooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
+class GelbooruBase():
"""Base class for gelbooru extractors"""
category = "gelbooru"
- api_url = "https://gelbooru.com/index.php"
- post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
- pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+ root = "https://gelbooru.com"
- def __init__(self, match):
- super().__init__(match)
-
- self.use_api = self.config("api", True)
- if self.use_api:
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
- else:
- self.items = self.items_noapi
- self.session.cookies["fringeBenefits"] = "yup"
- self.per_page = 42
-
- @staticmethod
- def get_file_url(image):
- url = image["file_url"]
+ def _prepare_post(self, post, extended_tags=False):
+ url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
if url.startswith("https://mp4.gelbooru.com/"):
- ihash = image["md5"]
+ md5 = post["md5"]
return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
- ihash[0:2], ihash[2:4], ihash)
+ md5[0:2], md5[2:4], md5)
return url
- def items_noapi(self):
- yield Message.Version, 1
- data = self.get_metadata()
-
- for post in self.get_posts():
- post = self.get_post_data(post)
- url = post["file_url"]
- post.update(data)
- text.nameext_from_url(url, post)
- yield Message.Directory, post
- yield Message.Url, url, post
-
- def get_posts(self):
- """Return an iterable containing all relevant post objects"""
- url = "https://gelbooru.com/index.php?page=post&s=list"
- params = {
- "tags": self.params["tags"],
- "pid" : self.page_start * self.per_page
- }
-
- while True:
- page = self.request(url, params=params).text
- ids = list(text.extract_iter(page, '<span id="s', '"'))
- yield from ids
- if len(ids) < self.per_page:
- return
- params["pid"] += self.per_page
-
- def get_post_data(self, post_id):
- """Extract metadata of a single post"""
- page = self.request(self.post_url.format(post_id)).text
- data = text.extract_all(page, (
- (None , '<meta name="keywords"', ''),
- ("tags" , ' imageboard- ', '"'),
- ("id" , '<li>Id: ', '<'),
- ("created_at", '<li>Posted: ', '<'),
- ("width" , '<li>Size: ', 'x'),
- ("height" , '', '<'),
- ("source" , '<li>Source: <a href="', '"'),
- ("rating" , '<li>Rating: ', '<'),
- (None , '<li>Score: ', ''),
- ("score" , '>', '<'),
- ("file_url" , '<li><a href="http', '"'),
- ("change" , ' id="lupdated" value="', '"'),
- ))[0]
- data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
- data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
- data["rating"] = (data["rating"] or "?")[0].lower()
- data["tags"] = " ".join(
- [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
- if self.extags:
- self.extended_tags(data, page)
- return data
-
-class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
+class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
@@ -112,7 +41,7 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
)
-class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
+class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
"""Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -126,8 +55,23 @@ class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
}),
)
+ def metadata(self):
+ url = "{}/index.php?page=pool&s=show&id={}".format(
+ self.root, self.pool_id)
+ page = self.request(url).text
+
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)
+
+ return {
+ "pool": text.parse_int(self.pool_id),
+ "pool_name": text.unescape(name),
+ }
+
-class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
"""Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")
@@ -135,6 +79,3 @@ class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"count": 1,
})
-
- def get_posts(self):
- return (self.post,)