# -*- coding: utf-8 -*-
# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://sizebooru.com/"""
from .booru import BooruExtractor
from .. import text
# URL prefix shared by every extractor pattern in this module:
# optional scheme, optional "www." and the sizebooru.com host name.
BASE_PATTERN = (
    r"(?:https?://)?"
    r"(?:www\.)?"
    r"sizebooru\.com"
)
class SizebooruExtractor(BooruExtractor):
    """Base class for sizebooru extractors

    Holds the settings shared by all sizebooru extractors together with
    the helpers that prepare post dicts and walk paginated listings.
    """
    category = "sizebooru"
    # Base URL; every request and file URL in this class is built from it
    root = "https://sizebooru.com"
    # NOTE(review): presumably consumed by the base extractor to name
    # downloaded files and to key the download archive -- confirm
    filename_fmt = "{id}.{extension}"
    archive_fmt = "{id}"
    # Value of the "pageNo" parameter sent with the first listing request
    page_start = 1
    # NOTE(review): looks like a (min, max) delay range between requests;
    # it is not used in this file, so units and handling need confirming
    request_interval = (0.5, 1.5)

    def _init(self):
        """Select the metadata-fetching prepare step if it is enabled

        When the "metadata" option (default: False) is truthy,
        self._prepare is rebound to self._prepare_metadata.
        """
        if self.config("metadata", False):
            self._prepare = self._prepare_metadata

    def _file_url(self, post):
        """Build the file URL of 'post', store it and return it

        The URL is "<root>/Picture/<id>"; it is also written to
        post["file_url"].
        """
        post["file_url"] = url = f"{self.root}/Picture/{post['id']}"
        return url

    def _prepare(self, post):
        """Normalize 'post' in place without any additional request

        The ID is passed through text.parse_int(), its original value is
        used as filename, and a falsy extension is replaced by "jpg".
        """
        post_id = post["id"]
        post["id"] = text.parse_int(post_id)
        # Keep the ID as it was received for use as the filename
        post["filename"] = post_id
        if not post["extension"]:
            post["extension"] = "jpg"

    def _prepare_metadata(self, post):
        """Fill 'post' with metadata from its details page and return it

        Requests "<root>/Details/<id>", updates 'post' with the extracted
        fields and derives filename and extension from an alt="..."
        attribute value.
        """
        post_id = post["id"]
        url = f"{self.root}/Details/{post_id}"
        # NOTE(review): text.extract_from() presumably returns a function
        # that scans forward through the page, in which case the order of
        # the extr() calls below matters -- confirm before reordering
        extr = text.extract_from(self.request(url).text)
        # NOTE(review): several delimiter literals in this dict are empty
        # ("") or contain raw line breaks, and the "tags" / "favorite"
        # entries are not valid Python as written.  Presumably markup
        # inside these string literals was stripped; other literals here
        # may be affected as well.  Restore them from a known-good copy.
        # Lines starting in column 0 are kept there on purpose, since they
        # sit inside the damaged literals.
        post.update({
            "id" : text.parse_int(post_id),
            "date" : self.parse_datetime(
                extr("Posted Date: ", "<"), "%m/%d/%Y"),
            "date_approved": self.parse_datetime(
                extr("Approved Date: ", "<"), "%m/%d/%Y"),
            "approver" : text.remove_html(extr("Approved By:", "")),
            "uploader" : text.remove_html(extr("Posted By:", "")),
            # An artist value of "N/A" is stored as None
            "artist" : None
                if (artist := extr("Artist: ", "")) == "N/A" else # noqa: E131 E501
                text.remove_html(artist), # noqa: E131
            "views" : text.parse_int(extr("Views:", "<")),
            # An empty source link is stored as None
            "source" : text.extr(extr(
                "Source Link:", ""), ' href="', '"') or None,
            "tags" : text.split_html(extr(
                "
Related Tags
", "")),
            "favorite" : text.split_html(extr(
                "Favorited By
", "")),
        })
        # Split the alt text at its last "." into filename and extension
        post["filename"], _, ext = extr('" alt="', '"').rpartition(".")
        # Only fill in the extension when the post does not have one yet
        if not post["extension"]:
            post["extension"] = ext.lower()
        return post

    def _pagination(self, url, callback=None):
        """Request the listing at 'url' page by page

        The first page is requested with "pageNo" set to self.page_start
        and "pageSize" set to self.per_page (not defined in this class;
        presumably inherited -- confirm).  If 'callback' is given, it is
        called once with the text of that first page.  "pageNo" is
        incremented by one for each further request.
        """
        params = {
            "pageNo" : self.page_start,
            "pageSize": self.per_page,
        }
        page = self.request(url, params=params).text
        if callback is not None:
            callback(page)
        while True:
            # Stays None when the loop below finds nothing on this page
            thumb = None
            # NOTE(review): the following lines are not valid Python as
            # they stand.  The extract_iter() call is never closed and its
            # delimiter literal runs straight into what looks like the tail
            # of a later end-of-results condition; the loop body and the
            # start of that condition are missing (presumably markup in
            # the string literals was stripped).  Restore them from a
            # known-good copy before relying on this method.
            for thumb in text.extract_iter(
                    page, '") or \
thumb is None:
                return
            params["pageNo"] += 1
            page = self.request(url, params=params).text
class SizebooruPostExtractor(SizebooruExtractor):
    """Extractor for a single sizebooru post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/Details/(\d+)"
    example = "https://sizebooru.com/Details/12345"

    def posts(self):
        """Return a one-element tuple with the post stub for the URL's ID

        The stub carries the ID captured by 'pattern' and an unset
        extension.
        """
        stub = {
            "id": self.groups[0],
            "extension": None,
        }
        return (stub,)
class SizebooruTagExtractor(SizebooruExtractor):
    """Extractor for the results of a sizebooru tag search"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"/Search/([^/?#]+)"
    example = "https://sizebooru.com/Search/TAG"

    def posts(self):
        """Store the unquoted search tags and paginate the search results

        The tag string is used in the request URL exactly as captured
        from the input URL; only its unquoted form goes into 'kwdict'.
        """
        quoted = self.groups[0]
        self.kwdict["search_tags"] = text.unquote(quoted)
        search_url = f"{self.root}/Search/{quoted}"
        return self._pagination(search_url)
class SizebooruGalleryExtractor(SizebooruExtractor):
    """Extractor for the posts of a sizebooru gallery"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{gallery_name} ({gallery_id})")
    pattern = BASE_PATTERN + r"/Galleries/List/(\d+)"
    example = "https://sizebooru.com/Galleries/List/123"

    def posts(self):
        """Store the gallery ID and paginate over the gallery listing

        '_extract_name' is handed to '_pagination' as its callback.
        """
        gallery = self.groups[0]
        self.kwdict["gallery_id"] = text.parse_int(gallery)
        listing_url = f"{self.root}/Galleries/List/{gallery}"
        return self._pagination(listing_url, callback=self._extract_name)

    def _extract_name(self, page):
        """Store the unescaped gallery name found in 'page' in 'kwdict'"""
        raw_name = text.extr(page, "Gallery: ", " - Size Booru<")
        self.kwdict["gallery_name"] = text.unescape(raw_name)
class SizebooruUserExtractor(SizebooruExtractor):
    """Extractor for the uploads of a sizebooru user"""
    subcategory = "user"
    directory_fmt = ("{category}", "Uploads {user}")
    pattern = BASE_PATTERN + r"/Profile/Uploads/([^/?#]+)"
    example = "https://sizebooru.com/Profile/Uploads/USER"

    def posts(self):
        """Store the unquoted user name and paginate over the uploads

        The user name is used in the request URL exactly as captured
        from the input URL; only its unquoted form goes into 'kwdict'.
        """
        quoted = self.groups[0]
        self.kwdict["user"] = text.unquote(quoted)
        uploads_url = f"{self.root}/Profile/Uploads/{quoted}"
        return self._pagination(uploads_url)
class SizebooruFavoriteExtractor(SizebooruExtractor):
    """Extractor for the favorites of a sizebooru user"""
    subcategory = "favorite"
    directory_fmt = ("{category}", "Favorites {user}")
    pattern = BASE_PATTERN + r"/Profile/Favorites/([^/?#]+)"
    example = "https://sizebooru.com/Profile/Favorites/USER"

    def posts(self):
        """Store the unquoted user name and paginate over the favorites

        The user name is used in the request URL exactly as captured
        from the input URL; only its unquoted form goes into 'kwdict'.
        """
        quoted = self.groups[0]
        self.kwdict["user"] = text.unquote(quoted)
        favorites_url = f"{self.root}/Profile/Favorites/{quoted}"
        return self._pagination(favorites_url)