# -*- coding: utf-8 -*-
# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://realbooru.com/"""
from . import booru
from .. import text, util
import collections
BASE_PATTERN = r"(?:https?://)?realbooru\.com"
# Shared base for the realbooru.com extractors in this module: carries the
# site identity attributes and the single-post HTML parser.
class RealbooruExtractor(booru.BooruExtractor):
basecategory = "booru"
category = "realbooru"
# Site origin; _parse_post() prefixes it to the post-view URL it requests.
root = "https://realbooru.com"
# Request the HTML view page of post `post_id` and collect its fields into
# the `post` dict by reading delimited substrings out of the page text.
def _parse_post(self, post_id):
url = f"{self.root}/index.php?page=post&s=view&id={post_id}"
page = self.request(url).text
# `extr` is called below with a (begin, end) marker pair per field.
# NOTE(review): presumably every call resumes where the previous one
# stopped, which would make the order of the calls significant - confirm
# against text.extract_from().
extr = text.extract_from(page)
rating = extr('name="rating" content="', '"')
# Return value discarded; presumably this only moves the read position past
# the container element before the fields below are read - TODO confirm.
extr('class="container"', '>')
post = {
"id" : post_id,
# "adult" maps to "e"; any other value is cut down to its first character,
# and an empty/missing rating becomes "?".
"rating" : "e" if rating == "adult" else (rating or "?")[0],
# The walrus keeps the extracted URL in `s` so the next entry can test it.
"file_url" : (s := extr('src="', '"')),
# Only when the file URL ends in ".mp4" is a second src=" value read and
# stored as a 1-tuple; otherwise no extra read happens and this is ().
"_fallback" : (extr('src="', '"'),) if s.endswith(".mp4") else (),
"created_at": extr(">Posted at ", " by "),
"uploader" : extr(">", "<"),
"score" : extr('">', "<"),
# NOTE(review): the "tags" entry below is a single-quoted literal that
# spans a line break and has an empty end marker, so it does not parse as
# written. Presumably markup inside both markers was stripped from this
# copy of the file - restore from a pristine copy before relying on it.
"tags" : extr('
', ""),
"title" : extr('id="title" style="width: 100%;" value="', '"'),
"source" : extr('d="source" style="width: 100%;" value="', '"'),
}
# Working state for splitting the raw tag markup: a flat list of tags plus
# a mapping from a key to a list (defaultdict(list)). The code that fills
# them is not part of the visible text.
tags_container = post["tags"]
tags = []
tags_categories = collections.defaultdict(list)
# NOTE(review): the raw string opened with r' on the next line is never
# closed and the method body stops here, with no tag loop and no return.
# The statements that follow in the file use names (page, pos, pool_id,
# name) that are not bound anywhere in the visible text, so part of the
# file appears to be missing at this point - TODO restore before editing.
pattern = text.re(r'Pool: ", "")
self.post_ids = text.extract_iter(
page, 'class="thumb" id="p', '"', pos)
return {
"pool": text.parse_int(pool_id),
"pool_name": text.unescape(name),
}
def posts(self):
    """Return a lazy iterator of parsed posts for the collected post IDs.

    The ID iterable in self.post_ids is first passed through
    util.advance() together with self.page_start; each remaining ID is
    then handed to self._parse_post(). Nothing is parsed until the
    returned map object is consumed.
    """
    remaining_ids = util.advance(self.post_ids, self.page_start)
    return map(self._parse_post, remaining_ids)
class RealbooruPostExtractor(RealbooruExtractor):
    """Extractor for a single realbooru post, addressed by its post-view URL"""
    subcategory = "post"
    archive_fmt = "{id}"
    # The only capture group is the run of digits given as the "id" value.
    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
    example = "https://realbooru.com/index.php?page=post&s=view&id=12345"

    def posts(self):
        """Return a 1-tuple holding the parsed post.

        The ID is taken from self.groups[0] - presumably the digits
        captured by `pattern`; the attribute is set outside this class.
        """
        post_id = self.groups[0]
        return (self._parse_post(post_id),)