1 file changed, 26 insertions, 85 deletions
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index c32ba5c..b0614e2 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -6,98 +6,27 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://gelbooru.com/"""
+"""Extractors for https://gelbooru.com/"""
 
 from . import booru
-from .common import Message
-from .. import text
+from .. import text, exception
 
 
-class GelbooruExtractor(booru.XmlParserMixin,
-                        booru.GelbooruPageMixin,
-                        booru.BooruExtractor):
+class GelbooruBase():
     """Base class for gelbooru extractors"""
     category = "gelbooru"
-    api_url = "https://gelbooru.com/index.php"
-    post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
-    pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+    root = "https://gelbooru.com"
 
-    def __init__(self, match):
-        super().__init__(match)
-
-        self.use_api = self.config("api", True)
-        if self.use_api:
-            self.params.update({"page": "dapi", "s": "post", "q": "index"})
-        else:
-            self.items = self.items_noapi
-            self.session.cookies["fringeBenefits"] = "yup"
-            self.per_page = 42
-
-    @staticmethod
-    def get_file_url(image):
-        url = image["file_url"]
+    def _prepare_post(self, post, extended_tags=False):
+        url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
         if url.startswith("https://mp4.gelbooru.com/"):
-            ihash = image["md5"]
+            md5 = post["md5"]
             return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
-                ihash[0:2], ihash[2:4], ihash)
+                md5[0:2], md5[2:4], md5)
         return url
 
-    def items_noapi(self):
-        yield Message.Version, 1
-        data = self.get_metadata()
-
-        for post in self.get_posts():
-            post = self.get_post_data(post)
-            url = post["file_url"]
-            post.update(data)
-            text.nameext_from_url(url, post)
-            yield Message.Directory, post
-            yield Message.Url, url, post
-
-    def get_posts(self):
-        """Return an iterable containing all relevant post objects"""
-        url = "https://gelbooru.com/index.php?page=post&s=list"
-        params = {
-            "tags": self.params["tags"],
-            "pid" : self.page_start * self.per_page
-        }
-
-        while True:
-            page = self.request(url, params=params).text
-            ids = list(text.extract_iter(page, '<span id="s', '"'))
-            yield from ids
-            if len(ids) < self.per_page:
-                return
-            params["pid"] += self.per_page
-
-    def get_post_data(self, post_id):
-        """Extract metadata of a single post"""
-        page = self.request(self.post_url.format(post_id)).text
-        data = text.extract_all(page, (
-            (None        , '<meta name="keywords"', ''),
-            ("tags"      , ' imageboard- ', '"'),
-            ("id"        , '<li>Id: ', '<'),
-            ("created_at", '<li>Posted: ', '<'),
-            ("width"     , '<li>Size: ', 'x'),
-            ("height"    , '', '<'),
-            ("source"    , '<li>Source: <a href="', '"'),
-            ("rating"    , '<li>Rating: ', '<'),
-            (None        , '<li>Score: ', ''),
-            ("score"     , '>', '<'),
-            ("file_url"  , '<li><a href="http', '"'),
-            ("change"    , ' id="lupdated" value="', '"'),
-        ))[0]
-        data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
-        data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
-        data["rating"] = (data["rating"] or "?")[0].lower()
-        data["tags"] = " ".join(
-            [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
-        if self.extags:
-            self.extended_tags(data, page)
-        return data
 
-
-class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
+class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
     """Extractor for images from gelbooru.com based on search-tags"""
     pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
@@ -112,7 +41,7 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
     )
 
 
-class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
+class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
     """Extractor for image-pools from gelbooru.com"""
     pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -126,8 +55,23 @@ class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
         }),
     )
 
+    def metadata(self):
+        """Fetch the pool page, store its post ids and return pool metadata"""
+        url = "{}/index.php?page=pool&s=show&id={}".format(
+            self.root, self.pool_id)
+        page = self.request(url).text
+        name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+        if not name:  # no "Now Viewing" heading was found in the page
+            raise exception.NotFoundError("pool")
+        self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)
+
+        return {
+            "pool": text.parse_int(self.pool_id),
+            "pool_name": text.unescape(name),
+        }
+
 
-class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
     """Extractor for single images from gelbooru.com"""
     pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=post&s=view&id=(?P<post>\d+)")
@@ -135,6 +79,3 @@ class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
         "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
         "count": 1,
     })
-
-    def get_posts(self):
-        return (self.post,)