Diffstat (limited to 'gallery_dl/extractor/booru.py')
-rw-r--r--  gallery_dl/extractor/booru.py  381
1 file changed, 191 insertions(+), 190 deletions(-)
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 0176d76..517df93 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,247 +1,248 @@
 # -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-"""Base classes for extractors for danbooru and co"""
+"""Extractors for *booru sites"""
+
+from .common import Extractor, Message, generate_extractors
+from .. import text, util, exception
-from .common import Extractor, Message, SharedConfigMixin
-from .. import text, exception
 from xml.etree import ElementTree
 import collections
-import datetime
-import operator
 import re
-class BooruExtractor(SharedConfigMixin, Extractor):
-    """Base class for all booru extractors"""
+class BooruExtractor(Extractor):
+    """Base class for *booru extractors"""
     basecategory = "booru"
     filename_fmt = "{category}_{id}_{md5}.{extension}"
-    api_url = ""
-    post_url = ""
-    per_page = 50
-    page_start = 1
-    page_limit = None
-    sort = False
+    page_start = 0
+    per_page = 100
-    def __init__(self, match):
-        super().__init__(match)
-        self.params = {}
-        self.extags = self.post_url and self.config("tags", False)
+    def items(self):
+        self.login()
+        extended_tags = self.config("tags", False)
+        data = self.metadata()
+        for post in self.posts():
+            try:
+                url = self._prepare_post(post, extended_tags)
+            except KeyError:
+                continue
+            post.update(data)
+            text.nameext_from_url(url, post)
+            yield Message.Directory, post
+            yield Message.Url, url, post
     def skip(self, num):
         pages = num // self.per_page
-        if self.page_limit and pages + self.page_start > self.page_limit:
-            pages = self.page_limit - self.page_start
         self.page_start += pages
         return pages * self.per_page
-    def items(self):
-        yield Message.Version, 1
-        data = self.get_metadata()
+    def login(self):
+        """Login and set necessary cookies"""
-        self.reset_page()
-        while True:
-            images = self.parse_response(
-                self.request(self.api_url, params=self.params))
-
-            for image in images:
-                try:
-                    url = self.get_file_url(image)
-                except KeyError:
-                    continue
-                if url.startswith("/"):
-                    url = text.urljoin(self.api_url, url)
-                image.update(data)
-                text.nameext_from_url(url, image)
-                if self.extags:
-                    self.extended_tags(image)
-                yield Message.Directory, image
-                yield Message.Url, url, image
-
-            if len(images) < self.per_page:
-                return
-            self.update_page(image)
+    def metadata(self):
+        """Return a dict with general metadata"""
+        return ()
-    def reset_page(self):
-        """Initialize params to point to the first page"""
-        self.params["page"] = self.page_start
+    def posts(self):
+        """Return an iterable with post objects"""
+        return ()
-    def update_page(self, data):
-        """Update params to point to the next page"""
+    def _prepare_post(self, post, extended_tags=False):
+        url = post["file_url"]
+        if url[0] == "/":
+            url = self.root + url
+        if extended_tags:
+            self._fetch_extended_tags(post)
+        post["date"] = text.parse_datetime(
+            post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+        return url
-    def parse_response(self, response):
-        """Parse JSON API response"""
-        images = response.json()
-        if self.sort:
-            images.sort(key=operator.itemgetter("score", "id"),
-                        reverse=True)
-        return images
+    def _fetch_extended_tags(self, post, page=None):
+        if not page:
+            url = "{}/index.php?page=post&s=view&id={}".format(
+                self.root, post["id"])
+            page = self.request(url).text
+        html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+        if html:
+            tags = collections.defaultdict(list)
+            pattern = re.compile(
+                r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+            for tag_type, tag_name in pattern.findall(html):
+                tags[tag_type].append(text.unquote(tag_name))
+            for key, value in tags.items():
+                post["tags_" + key] = " ".join(value)
+
+    def _api_request(self, params):
+        url = self.root + "/index.php?page=dapi&s=post&q=index"
+        return ElementTree.fromstring(self.request(url, params=params).text)
+
+    def _pagination(self, params):
+        params["pid"] = self.page_start
+        params["limit"] = self.per_page
-    def get_metadata(self):
-        """Collect metadata for extractor-job"""
-        return {}
+        while True:
+            root = self._api_request(params)
+            for post in root:
+                yield post.attrib
-    @staticmethod
-    def get_file_url(image):
-        return image["file_url"]
+            if len(root) < self.per_page:
+                return
+            params["pid"] += 1
-    def extended_tags(self, image, page=None):
-        """Retrieve extended tag information"""
-        if not page:
-            url = self.post_url.format(image["id"])
-            page = self.request(url).text
-        tags = collections.defaultdict(list)
-        tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
-        pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
-        for tag_type, tag_name in pattern.findall(tags_html or ""):
-            tags[tag_type].append(text.unquote(tag_name))
-        for key, value in tags.items():
-            image["tags_" + key] = " ".join(value)
-
-
-class XmlParserMixin():
-    """Mixin for XML based API responses"""
-    def parse_response(self, response):
-        root = ElementTree.fromstring(response.text)
-        return [post.attrib for post in root]
-
-
-class MoebooruPageMixin():
-    """Pagination for Moebooru and Danbooru v1"""
-    def update_page(self, data):
-        if self.page_limit:
-            self.params["page"] = None
-            self.params["before_id"] = data["id"]
-        else:
-            self.params["page"] += 1
-
-
-class GelbooruPageMixin():
-    """Pagination for Gelbooru-like sites"""
-    page_start = 0
-    def reset_page(self):
-        self.params["pid"] = self.page_start
+class BooruPostExtractor(BooruExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern_fmt = r"/index\.php\?page=post&s=view&id=(\d+)"
-    def update_page(self, data):
-        self.params["pid"] += 1
+    def __init__(self, match):
+        BooruExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+    def posts(self):
+        return self._pagination({"id": self.post_id})
-class TagMixin():
-    """Extraction of images based on search-tags"""
+
+class BooruTagExtractor(BooruExtractor):
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "t_{search_tags}_{id}"
+    pattern_fmt = r"/index\.php\?page=post&s=list&tags=([^&#]+)"
     def __init__(self, match):
-        super().__init__(match)
-        self.tags = text.unquote(match.group("tags").replace("+", " "))
-        self.params["tags"] = self.tags
-        self.params["limit"] = self.per_page
+        BooruExtractor.__init__(self, match)
+        self.tags = text.unquote(match.group(1).replace("+", " "))
-    def get_metadata(self):
+    def metadata(self):
         return {"search_tags": self.tags}
+    def posts(self):
+        return self._pagination({"tags": self.tags})
+
-class PoolMixin():
-    """Extraction of image-pools"""
+class BooruPoolExtractor(BooruExtractor):
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool}")
     archive_fmt = "p_{pool}_{id}"
+    pattern_fmt = r"/index\.php\?page=pool&s=show&id=(\d+)"
     def __init__(self, match):
-        super().__init__(match)
-        self.pool = match.group("pool")
-        self.params["tags"] = "pool:" + self.pool
-        self.params["limit"] = self.per_page
-
-    def get_metadata(self):
-        return {"pool": text.parse_int(self.pool)}
+        BooruExtractor.__init__(self, match)
+        self.pool_id = match.group(1)
+        self.post_ids = ()
+    def skip(self, num):
+        self.page_start += num
+        return num
-class GelbooruPoolMixin(PoolMixin):
-    """Image-pool extraction for Gelbooru-like sites"""
-    per_page = 1
+    def metadata(self):
+        url = "{}/index.php?page=pool&s=show&id={}".format(
+            self.root, self.pool_id)
+        page = self.request(url).text
-    def get_metadata(self):
-        page = self.request(self.pool_url.format(self.pool)).text
-        name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
-        if not name:
-            name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+        name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
         if not name:
             raise exception.NotFoundError("pool")
-        self.posts = list(text.extract_iter(
-            page, 'class="thumb" id="p', '"', pos))
+        self.post_ids = text.extract_iter(
+            page, 'class="thumb" id="p', '"', pos)
         return {
-            "pool": text.parse_int(self.pool),
+            "pool": text.parse_int(self.pool_id),
             "pool_name": text.unescape(name),
-            "count": len(self.posts),
         }
-    def reset_page(self):
-        self.index = self.page_start
-        self.update_page(None)
-
-    def update_page(self, data):
-        try:
-            post = self.posts[self.index]
-            self.index += 1
-        except IndexError:
-            post = "0"
-        self.params["tags"] = "id:" + post
-
-
-class PostMixin():
-    """Extraction of a single image-post"""
-    subcategory = "post"
-    archive_fmt = "{id}"
-
-    def __init__(self, match):
-        super().__init__(match)
-        self.post = match.group("post")
-        self.params["tags"] = "id:" + self.post
-
-
-class MoebooruPopularMixin():
-    """Extraction and metadata handling for Moebooru and Danbooru v1"""
-    subcategory = "popular"
-    directory_fmt = ("{category}", "popular", "{scale}", "{date}")
-    archive_fmt = "P_{scale[0]}_{date}_{id}"
-    page_start = None
-    sort = True
-
-    def __init__(self, match):
-        super().__init__(match)
-        self.params.update(text.parse_query(match.group("query")))
-        self.scale = match.group("scale")
-
-    def get_metadata(self, fmt="%Y-%m-%d"):
-        date = self.get_date() or datetime.date.today().isoformat()
-        scale = self.get_scale() or "day"
-
-        if scale == "week":
-            date = datetime.date.fromisoformat(date)
-            date = (date - datetime.timedelta(days=date.weekday())).isoformat()
-        elif scale == "month":
-            date = date[:-3]
-
-        return {"date": date, "scale": scale}
-
-    def get_date(self):
-        if "year" in self.params:
-            return "{:>04}-{:>02}-{:>02}".format(
-                self.params["year"],
-                self.params.get("month", "01"),
-                self.params.get("day", "01"))
-        return None
-
-    def get_scale(self):
-        if self.scale and self.scale.startswith("by_"):
-            return self.scale[3:]
-        return self.scale
+    def posts(self):
+        params = {}
+        for params["id"] in util.advance(self.post_ids, self.page_start):
+            for post in self._api_request(params):
+                yield post.attrib
+
+
+EXTRACTORS = {
+    "rule34": {
+        "root": "https://rule34.xxx",
+        "test-tag": (
+            ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+                "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+                "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+                "count": 1,
+            }),
+        ),
+        "test-pool": (
+            ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+                "count": 3,
+            }),
+        ),
+        "test-post": (
+            ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+                "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+                "options": (("tags", True),),
+                "keyword": {
+                    "tags_artist": "danraku",
+                    "tags_character": "kashima_(kantai_collection)",
+                    "tags_copyright": "kantai_collection",
+                    "tags_general": str,
+                    "tags_metadata": str,
+                },
+            }),
+        ),
+    },
+    "safebooru": {
+        "root": "https://safebooru.org",
+        "test-tag": (
+            ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+                "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+                "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+            }),
+        ),
+        "test-pool": (
+            ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+                "count": 5,
+            }),
+        ),
+        "test-post": (
+            ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+                "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+                "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+                "options": (("tags", True),),
+                "keyword": {
+                    "tags_artist": "kawanakajima",
+                    "tags_character": "heath_ledger ronald_mcdonald the_joker",
+                    "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+                    "tags_general": str,
+                },
+            }),
+        ),
+    },
+    "realbooru": {
+        "root": "https://realbooru.com",
+        "test-tag": (
+            ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
+                "count": ">= 64",
+            }),
+        ),
+        "test-pool": (
+            ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
+                "count": 3,
+            }),
+        ),
+        "test-post": (
+            ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
+                "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+                "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+            }),
+        ),
+    },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+    BooruTagExtractor,
+    BooruPoolExtractor,
+    BooruPostExtractor,
+))